diff --git a/CMakeLists.txt b/CMakeLists.txt index 28dc39920c6d07..250907a020cc6e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -284,6 +284,27 @@ if(WITH_GPU) endif() endif() +if(WITH_ROCM) + include(hip) + include(miopen) # set miopen libraries, must come before configure +endif(WITH_ROCM) + +if (NOT WITH_ROCM AND WITH_RCCL) + MESSAGE(WARNING + "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) +endif() + +if(WITH_RCCL) + add_definitions("-DPADDLE_WITH_RCCL") + include(rccl) +else() + if(WITH_ROCM) + MESSAGE(WARNING "In a multi-card environment, the WITH_RCCL option needs to be ON; otherwise only a single card can be used.") + endif() +endif() + include(third_party) # download, build, install third_party, Contains about 20+ dependencies include(flags) # set paddle compile flags @@ -308,26 +329,6 @@ include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") -if(WITH_ROCM) - include(hip) -endif(WITH_ROCM) - -if (NOT WITH_ROCM AND WITH_RCCL) - MESSAGE(WARNING - "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") - set(WITH_RCCL OFF CACHE STRING - "Disable RCCL when compiling without ROCM" FORCE) -endif() - -if(WITH_RCCL) - add_definitions("-DPADDLE_WITH_RCCL") - include(rccl) -else() - if(WITH_ROCM) - MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.") - endif() -endif() - if(WITH_NV_JETSON) set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) endif() diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6056b53bc2218f..8e762be646acb8 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -69,15 +69,21 @@ if(NOT DEFINED CBLAS_PROVIDER) PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - set(CBLAS_PROVIDER OPENBLAS) - set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) - set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - - add_definitions(-DPADDLE_USE_OPENBLAS) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file) + string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) + string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) + + if (${ver} VERSION_EQUAL "0.3.7") + set(CBLAS_PROVIDER OPENBLAS) + set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) + set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) + + add_definitions(-DPADDLE_USE_OPENBLAS) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() endif() endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e7f125269be1f5..458ab992c25f38 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -143,6 +143,14 @@ elseif(WITH_ROCM) add_definitions(-DPADDLE_WITH_HIP) add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_HIP) + + if(NOT MIOPEN_FOUND) + message(FATAL_ERROR "Paddle needs MIOpen to compile") + endif() + + if(${MIOPEN_VERSION} VERSION_LESS 2090) + message(FATAL_ERROR "Paddle needs MIOpen >= 2.9 to compile") + endif() else()
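A note on the `VERSION_LESS 2090` guard: `MIOPEN_VERSION` is the flat integer that the new `cmake/miopen.cmake` (added later in this diff) computes from MIOpen's `version.h`, so `2090` is simply v2.9.0 under that encoding. A minimal C++ sketch of the scheme, assuming only the `major * 1000 + minor * 10 + patch` formula from `find_miopen_version`:

```cpp
#include <cassert>

// Flat encoding used by find_miopen_version in cmake/miopen.cmake:
// major * 1000 + minor * 10 + patch. v2.9.0 -> 2090, which is why the
// configure step rejects MIOPEN_VERSION VERSION_LESS 2090.
constexpr int EncodeMiopenVersion(int major, int minor, int patch) {
  return major * 1000 + minor * 10 + patch;
}

int main() {
  static_assert(EncodeMiopenVersion(2, 9, 0) == 2090, "v2.9.0 is the floor");
  assert(EncodeMiopenVersion(2, 11, 0) >= 2090);  // newer minors still pass
  // Caveat of the scheme: a patch level >= 10 would bleed into the minor
  // digit; the check only needs coarse ordering, so this is tolerated.
  return 0;
}
```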
add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 033b40622e2594..9bdfc36201d539 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -95,11 +95,23 @@ function(select_nvcc_arch_flags out_variable) if(${CUDA_ARCH_NAME} STREQUAL "Kepler") set(cuda_arch_bin "30 35") elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - set(cuda_arch_bin "50") + if (WITH_NV_JETSON) + set(cuda_arch_bin "53") + else() + set(cuda_arch_bin "50") + endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - set(cuda_arch_bin "60 61") + if (WITH_NV_JETSON) + set(cuda_arch_bin "62") + else() + set(cuda_arch_bin "60 61") + endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - set(cuda_arch_bin "70") + if (WITH_NV_JETSON) + set(cuda_arch_bin "72") + else() + set(cuda_arch_bin "70") + endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index ce5603b24b687d..d0d3901641c934 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG f3999b71d8e4415c1985a0dfb812a3ed77ee21fa) +SET(MKLDNN_TAG 748528a2d3204b5f401c14a9aacdec16accd5ead) # Introduce variables: diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index a03ff7d22dcad2..5d1f1776f885cd 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -7,52 +7,70 @@ SET(XPU_PROJECT "extern_xpu") SET(XPU_API_LIB_NAME "libxpuapi.so") SET(XPU_RT_LIB_NAME "libxpurt.so") -if(NOT XPU_SDK_ROOT) - if (WITH_AARCH64) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) - elseif(WITH_SUNWAY) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) - else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_05_19.tar.gz" CACHE STRING "" FORCE) - endif() - - SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") - SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") - SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") - SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") - SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") - - SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") - SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") - - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") - - FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(XPU)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY xpu/include xpu/lib \n" - " DESTINATION ${XPU_INSTALL_DIR})\n") - - ExternalProject_Add( - ${XPU_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${XPU_SOURCE_DIR} - DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz - && tar xvf xpu.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} - ) -else() - SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/") - SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so") - SET(XPU_RT_LIB 
"${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so") - SET(XPU_LIB_DIR "${XPU_SDK_ROOT}/XTDK/shlib/") -endif() +IF(WITH_AARCH64) + SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") + SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") + SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") +ELSEIF(WITH_SUNWAY) + SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") + SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") + SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") +ELSEIF(WITH_BDCENTOS) + SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSEIF(WITH_UBUNTU) + SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSEIF(WITH_CENTOS) + SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSE () + SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ENDIF() + +SET(XPU_BASE_URL "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev/20210527") +SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_XCCL_URL "${XPU_BASE_URL}/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) + +SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") +SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") +SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") +SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") +SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + +SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") +SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") + +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") + +FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY xpu/include xpu/lib \n" + " DESTINATION ${XPU_INSTALL_DIR})\n") + +ExternalProject_Add( + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${XPU_SOURCE_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget ${XPU_PACK_DEPENCE_URL} + && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} + + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} +) -INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) +INCLUDE_DIRECTORIES(${XPU_INC_DIR}) ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") @@ -62,7 +80,7 @@ generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -if (WITH_XPU_BKCL) +IF(WITH_XPU_BKCL) MESSAGE(STATUS "Compile with XPU BKCL!") ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL) @@ -71,9 +89,9 @@ if (WITH_XPU_BKCL) SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR}) TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) -else(WITH_XPU_BKCL) - TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ) -endif(WITH_XPU_BKCL) +ELSE(WITH_XPU_BKCL) + TARGET_LINK_LIBRARIES(xpulib 
${XPU_API_LIB} ${XPU_RT_LIB}) +ENDIF(WITH_XPU_BKCL) if(NOT XPU_SDK_ROOT) ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 84ab072ddcf9a7..8a18fa4a5512b3 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -320,12 +320,18 @@ function(version version_file) "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" - "WITH_GPU: ${WITH_GPU}\n") + "WITH_GPU: ${WITH_GPU}\n" + "WITH_ROCM: ${WITH_ROCM}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") endif() + if(WITH_ROCM) + file(APPEND ${version_file} + "HIP version: ${HIP_VERSION}\n" + "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake new file mode 100644 index 00000000000000..f482f423dc5c12 --- /dev/null +++ b/cmake/miopen.cmake @@ -0,0 +1,67 @@ +if(NOT WITH_ROCM) + return() +endif() + +# Now we don't support ROCm on windows +if(WIN32) + return() +endif() + +set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT") + +find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include + NO_DEFAULT_PATH +) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + +find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 + NO_DEFAULT_PATH + DOC "Path to MIOpen library.") + +if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY) + set(MIOPEN_FOUND ON) +else() + set(MIOPEN_FOUND OFF) +endif() + +macro(find_miopen_version miopen_header_file) + file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS) + get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1" + MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1" + MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1" + MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1" + MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}") + + if(NOT MIOPEN_MAJOR_VERSION) + set(MIOPEN_VERSION "???") + else() + add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"") + math(EXPR MIOPEN_VERSION + "${MIOPEN_MAJOR_VERSION} * 1000 + + ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}") + message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " + "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. 
") + endif() +endmacro() + +if(MIOPEN_FOUND) + find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h) +endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index d33edef38ca7b3..e3a78d3cf3bfe0 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -215,6 +215,8 @@ list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boos list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool) include(cblas) # find first, then download, build, install openblas + +message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}") if(${CBLAS_PROVIDER} STREQUAL MKLML) list(APPEND third_party_deps extern_mklml) elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8d1ae4926a8012..c06260b72e6ee7 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,7 +27,22 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) + proto_library(op_def_proto SRCS op_def.proto) +cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto) + +FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "namespace { \n" + "const std::unordered_map op_def_map = { \n") +foreach(OP_DEF_FILE ${OP_DEF_FILES}) + FILE(READ ${OP_DEF_FILE} OP_DEF_CONTENT) + get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE) + FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n") +endforeach(OP_DEF_FILE) +FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") + proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h index 4d7d9afe701929..cf64ccd60f45a4 100644 --- a/paddle/fluid/framework/details/nan_inf_utils.h +++ b/paddle/fluid/framework/details/nan_inf_utils.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -30,9 +31,28 @@ void CheckVarHasNanOrInf(const std::string& op_type, const std::string& var_name, const platform::Place& place); +void CheckVarHasNanOrInf(const std::string& op_type, + const std::string& var_name, + const framework::Variable* var, + const platform::Place& place); + void CheckOpHasNanOrInf(const framework::OperatorBase& op, const framework::Scope& scope, const platform::Place& place); + +template +void CheckOpHasNanOrInfInDygraph(const std::string& op_type, + const imperative::NameVarMap& op_outs, + platform::Place place) { + for (const auto& pair : op_outs) { + for (const auto& ivar : pair.second) { + auto* var = ivar->MutableVar(); + if (var == nullptr) continue; + CheckVarHasNanOrInf(op_type, ivar->Name(), var, place); + } + } +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index f9aa14bf7e8d7e..30231a1799fd37 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -297,13 +297,12 @@ void 
tensor_check(const std::string& op_type, } void CheckVarHasNanOrInf(const std::string& op_type, - const framework::Scope& scope, const std::string& var_name, + const framework::Variable* var, const platform::Place& place) { - auto* var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type, - var_name)); + var, platform::errors::NotFound("Cannot find var: `%s` in op `%s`.", + var_name, op_type)); const Tensor* tensor{nullptr}; if (var->IsType()) { @@ -393,6 +392,14 @@ void CheckVarHasNanOrInf(const std::string& op_type, tensor_check(op_type, var_name, *tensor, place); } +void CheckVarHasNanOrInf(const std::string& op_type, + const framework::Scope& scope, + const std::string& var_name, + const platform::Place& place) { + auto* var = scope.FindVar(var_name); + CheckVarHasNanOrInf(op_type, var_name, var, place); +} + bool IsSkipOp(const framework::OperatorBase& op) { if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 181e3b68853801..be05941efb5b4b 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -176,6 +176,7 @@ message DistributedStrategy { optional bool find_unused_parameters = 28 [ default = false ]; optional bool tensor_parallel = 29 [ default = false ]; optional bool without_graph_optimization = 30 [ default = false ]; + optional int32 fuse_grad_size_in_num = 31 [ default = 1 ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index fb478bb6e8978e..16dfc90d27e6a6 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -50,7 +50,7 @@ if (WITH_TESTING) endif(WITH_TESTING) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) -cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) +cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index bc1be79d1b1688..0bb2782b3737ee 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,7 +13,6 @@ // limitations under the License. 
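For context on the dygraph hook: the `CheckOpHasNanOrInfInDygraph` template added to `nan_inf_utils.h` above (its `template <typename VarType>` header and `NameVarMap<VarType>` parameter lost their angle brackets in this rendering) simply walks every output slot of an op and defers to the new `Variable*`-based per-variable check. A self-contained sketch, with standard containers standing in for the imperative types:

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Stand-ins for imperative::VarBase and imperative::NameVarMap<VarType>.
struct Var {
  std::string name;
  bool has_nan = false;  // payload reduced to a flag for the sketch
};
using NameVarMap = std::map<std::string, std::vector<std::shared_ptr<Var>>>;

void CheckVarHasNanOrInf(const std::string& op_type, const std::string& name,
                         const Var* var) {
  if (var->has_nan)
    std::cout << "Op(" << op_type << ") output " << name << " has NaN/Inf\n";
}

// Mirrors the new header template: walk every output slot of the op and
// delegate to the per-variable overload added in nan_inf_utils_detail.cc.
void CheckOpHasNanOrInfInDygraph(const std::string& op_type,
                                 const NameVarMap& op_outs) {
  for (const auto& pair : op_outs)
    for (const auto& ivar : pair.second)
      if (ivar) CheckVarHasNanOrInf(op_type, ivar->name, ivar.get());
}

int main() {
  NameVarMap outs{{"Out", {std::make_shared<Var>(Var{"y", true})}}};
  CheckOpHasNanOrInfInDygraph("relu", outs);
  return 0;
}
```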
#include "paddle/fluid/framework/ir/fc_fuse_pass.h" - #include #include "paddle/fluid/framework/op_version_registry.h" @@ -23,6 +22,67 @@ namespace paddle { namespace framework { namespace ir { +FCFusePass::FCFusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(1) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"relu", ""}) + .End(); +} + void FCFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -52,6 +112,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { LOG(WARNING) << "The subgraph is empty."; return; } + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle FC fuse"; GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); @@ -159,6 +223,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { } desc.Flush(); + if (!IsCompat(desc)) { + LOG(WARNING) << "Fc fuse pass in out fc op compat failed."; + return; + } + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. 
if (with_relu) { GraphSafeRemoveNodes( diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index f564bbb151854f..21ef17b65dc2cb 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -30,6 +30,7 @@ class Graph; class FCFusePass : public FusePassBase { public: + FCFusePass(); virtual ~FCFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index cf35c1ac772da0..5046911036818c 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -58,12 +58,12 @@ TEST(FCFusePass, basic) { auto* weights_0 = layers.data("weights_0", {}, true); auto* mul_out_0 = layers.mul(relu_out_0, weights_0); auto* bias_1 = layers.data("bias_1", {}, true); - auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1); + auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); auto* relu_out_1 = layers.relu(add_out_0); auto* weights_1 = layers.data("weights_1", {}, true); auto* mul_out_1 = layers.mul(relu_out_1, weights_1); auto* bias_2 = layers.data("bias_2", {}, true); - auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2); + auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); VLOG(4) << add_out_1; std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program())); diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index b056c3b07a2f65..e422a9bae31181 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/op_def_api.h" #include "paddle/fluid/framework/op_info.h" + namespace paddle { namespace framework { namespace ir { @@ -50,18 +53,17 @@ AttrCompat& AttrCompat::IsIntIn(const std::set<int>& candidates) { return *this; } -//! Todo: append the definition. AttrCompat& AttrCompat::IsLeftDefault() { const std::string& op_name = op_compat_->Name(); if (!OpInfoMap::Instance().Has(op_name)) { - VLOG(3) << "Op (" << op_name << ") is not registered!"; + LOG(WARNING) << "Op (" << op_name << ") is not registered!"; conditions_.emplace_back([](const Attribute& attr) { return false; }); return *this; } const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); if (attrs.find(attr_name_) == attrs.end()) { - VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; + LOG(WARNING) << "Op (" << op_name << ") has no default attr:" << attr_name_; conditions_.emplace_back([](const Attribute& attr) { return false; }); } else { Attribute default_attr = attrs.at(attr_name_); @@ -77,6 +79,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) { return true; } if (!op_desc.HasAttr(attr_name_)) { + if (!optional_) { + LOG(WARNING) << "The non-optional Attr(" << attr_name_ << ") of Op (" + << op_compat_->Name() << ") not found!";
+ } return optional_; } const Attribute attr = op_desc.GetAttr(attr_name_); @@ -149,19 +155,35 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { } bool OpCompat::Judge(const OpDesc& op_desc) { + if (is_first_judge_) { + is_first_judge_ = false; + const proto::OpDef& op_def = GetOpDef(op_name_); + if (op_def.has_extra()) { + for (const proto::OpDef_AttrDef& attr : op_def.extra().attrs()) { + extra_attrs_.emplace(attr.name()); + } + } + } + for (auto& attr_map : op_desc.GetAttrMap()) { if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { + if (extra_attrs_.find(attr_map.first) != extra_attrs_.end()) { + continue; + } if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { - VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ - << ") not reigistered in OpCompat, not equal to default value!"; + LOG(WARNING) + << "The Attr(" << attr_map.first << ") of Op (" << op_name_ + << ") not registered in OpCompat, not an extra attribute, and not " + "equal to the default value!"; return false; } } } + for (auto& attr_compat : attr_compats_) { if (!attr_compat.second(op_desc)) { - VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" - << op_name_ << ") failed!"; + LOG(WARNING) << " Check the Attr(" << attr_compat.first << ") of Op(" + << op_name_ << ") failed!"; return false; } } @@ -170,8 +192,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_desc : inputs_map) { if (input_compats_.find(input_desc.first) == input_compats_.end()) { if (!input_desc.second.empty()) { - VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" - << op_name_ << ") not reigistered in OpCompat!"; + LOG(WARNING) << "The Input (" << input_desc.first << ") of Operator (" + << op_name_ << ") not registered in OpCompat!"; return false; } } @@ -179,14 +201,15 @@ for (auto& input_val : input_compats_) { if (inputs_map.find(input_val.first) == inputs_map.end()) { if (!input_val.second.Optional()) { - VLOG(3) << "The No optional Input (" << input_val.first - << ") of Operator (" << op_name_ << ") not find in op_desc!"; + LOG(WARNING) << "The non-optional Input (" << input_val.first + << ") of Operator (" << op_name_ + << ") not found in op_desc!"; return false; } } else { if (!input_val.second(inputs_map.at(input_val.first))) { - VLOG(3) << "The Input (" << input_val.first << ") of Operator (" - << op_name_ << ") compat check failed!"; + LOG(WARNING) << "The Input (" << input_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } @@ -196,8 +219,8 @@ for (auto& output_desc : outputs_map) { if (output_compats_.find(output_desc.first) == output_compats_.end()) { if (!output_desc.second.empty()) { - VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" - << op_name_ << ") not reigistered in OpCompat!"; + LOG(WARNING) << "The Output (" << output_desc.first << ") of Operator (" + << op_name_ << ") not registered in OpCompat!"; return false; } } @@ -205,14 +228,15 @@ for (auto& output_val : output_compats_) { if (outputs_map.find(output_val.first) == outputs_map.end()) { if (!output_val.second.Optional()) { - VLOG(3) << "The No optional Output (" << output_val.first - << ") of Operator (" << op_name_ << ") not find in op_desc!"; + LOG(WARNING) << "The non-optional Output (" << output_val.first + << ") of Operator (" << op_name_ + << ") not found in op_desc!"; return
false; } } else { if (!output_val.second(outputs_map.at(output_val.first))) { - VLOG(3) << "The Output (" << output_val.first << ") of Operator (" - << op_name_ << ") compat check failed!"; + LOG(WARNING) << "The Output (" << output_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } @@ -226,6 +250,32 @@ OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) { return *(op_compat_judgers_[name]); } +//! Tell the Op compatibility of a subgraph. +bool OpCompatSensiblePass::IsCompat( + const GraphPatternDetector::subgraph_t& subgraph, Graph*) const { + PADDLE_ENFORCE_EQ(op_compat_judgers_.empty(), false, + platform::errors::InvalidArgument( + "At least one OpCompat instance should be added")); + // Check that all the ops in the subgraph are contained in the + // op_compat. + for (auto& node_pair : subgraph) { + if (!node_pair.second->IsOp()) continue; + auto op_type = node_pair.second->Op()->Type(); + if (!op_compat_judgers_.count(op_type)) { + if (HasOpDef(op_type)) { + LOG(WARNING) << op_type << " compat not registered!"; + return false; + } + continue; + } + auto& judger = *op_compat_judgers_.at(op_type); + if (!judger.Judge(*(node_pair.second->Op()))) { + return false; + } + } + return true; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index 3f2ea673d879b8..7346ca3756f361 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -140,6 +140,8 @@ class OpCompat { std::unordered_map<std::string, AttrCompat> attr_compats_; std::unordered_map<std::string, InputOrOutputCompat> input_compats_; std::unordered_map<std::string, InputOrOutputCompat> output_compats_; + std::unordered_set<std::string> extra_attrs_; + bool is_first_judge_ = true; }; /** @@ -193,25 +195,7 @@ class OpCompatSensiblePass : public Pass { //! Tell the Op compability of a subgraph. bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) const { - CHECK(!op_compat_judgers_.empty()) - << "At least one OpCompat instance should be added in the " - "OpCompatSensiblePass." - // Check the all the ops in the subgraph are contained in the - // op_compat. - for (auto& node_pair : subgraph) { - if (!node_pair.second->IsOp()) continue; - auto op_type = node_pair.second->Op()->Type(); - if (!op_compat_judgers_.count(op_type)) { - return false; - } - auto& judger = *op_compat_judgers_.at(op_type); - if (!judger.Judge(*(node_pair.second->Op()))) { - return false; - } - } - return true; - } + Graph* g) const; //! Tell the op compatibility of a single Op.
bool IsCompat(const OpDesc& op_desc) const { diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 0878e4d9890d35..9074a9876f9f7d 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -27,7 +27,6 @@ TEST(OpCompatSensiblePass, compatOp) { compat.AddAttr("in_num_col_dims") .IsIntIn({1, 2}) .IsNumLE(1) - .IsLeftDefault() .End() .AddAttr("activation_type") .IsStringIn({"tanh", "sigmoid"}) @@ -68,7 +67,7 @@ TEST(OpCompatSensiblePass, compatOp) { fc_op.SetOutput("Out", std::vector{"test_output"}); EXPECT_STREQ(compat.Name().c_str(), "fc"); - EXPECT_FALSE(compat.Judge(fc_op)); + EXPECT_TRUE(compat.Judge(fc_op)); } TEST(OpCompatSensiblePass, compatOpAttribute) { @@ -92,6 +91,18 @@ TEST(OpCompatSensiblePass, compatOpAttribute) { delete info.checker_; } +TEST(OpCompatSensiblePass, opDefNotFound) { + OpCompat compat("fc_1"); + + OpDesc fc_op; + + compat.Judge(fc_op); + + OpCompat compat_1(""); + + compat_1.Judge(fc_op); +} + TEST(OpCompatSensiblePass, compatOpAttributeOptional) { OpCompat compat("fc"); compat.AddAttr("activation_type") @@ -140,6 +151,10 @@ class OpCompatSensiblePassTest : public OpCompatSensiblePass { public: OpCompatSensiblePassTest(); bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); } + bool TestIsCompat(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + return IsCompat(subgraph, g); + } }; OpCompatSensiblePassTest::OpCompatSensiblePassTest() { @@ -181,6 +196,23 @@ TEST(OpCompatSensiblePass, IsCompat) { EXPECT_TRUE(test.TestIsCompat(fc_op)); } +TEST(OpCompatSensiblePass, IsCompatFail) { + OpCompatSensiblePassTest test; + GraphPatternDetector::subgraph_t subgraph; + PDPattern pattern; + PDNode* pd_node = pattern.NewNode(); + ProgramDesc prog; + Graph g(prog); + OpDesc fc_op; + fc_op.SetType("op1"); + subgraph[pd_node] = g.CreateOpNode(&fc_op); + EXPECT_TRUE(test.TestIsCompat(subgraph, &g)); + + fc_op.SetType("mul"); + subgraph[pd_node] = g.CreateOpNode(&fc_op); + EXPECT_FALSE(test.TestIsCompat(subgraph, &g)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index 6b187e538d1c08..850d3dca6d0e10 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -194,14 +194,18 @@ struct Layers { } VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, - int x_num_col_dims = 1) { + int x_num_col_dims = 1, int y_num_col_dims = 1) { AttributeMap attrs; - attrs["x_num_col_dims"] = 1; + attrs["x_num_col_dims"] = x_num_col_dims; + attrs["y_num_col_dims"] = y_num_col_dims; return binary_op("mul", x, y, out, &attrs); } - VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) { - return binary_op("elementwise_add", x, y, out); + VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, + int axis = -1) { + AttributeMap attrs; + attrs["axis"] = axis; + return binary_op("elementwise_add", x, y, out, &attrs); } VarDesc* elementwise_mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index bf59c140005167..4c87b63625c1f6 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ 
-66,9 +66,13 @@ static bool IsFCWithPaddingWeights(Node* n) { } static bool IsParamOfFC(Node* n, const std::string& param_name) { - if (IsInputOfFC(n) && n->inputs.empty() && - (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { - return true; + if (IsInputOfFC(n) && n->inputs.empty()) { + for (auto* out : n->outputs) { + if (out->Op()->Type() == "fc" && + n->Name() == out->Op()->Input(param_name)[0]) { + return true; + } + } } return false; } diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 0a6b5e44452fe1..69a2a6eefaf8ca 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -276,7 +276,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { SerializeToStream(os, tensor, *dev_ctx); } -void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { +void DeserializeFromStream(std::istream &os, LoDTensor *tensor) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext *dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 6b357aba1c5f9a..7dee0f44e384d4 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -257,7 +257,7 @@ LoD ConvertToOffsetBasedLoD(const LoD& length_lod); void SerializeToStream(std::ostream& os, const LoDTensor& tensor); -void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); +void DeserializeFromStream(std::istream& os, LoDTensor* tensor); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc new file mode 100644 index 00000000000000..b950f000bb8e50 --- /dev/null +++ b/paddle/fluid/framework/op_def_api.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
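The `op_def_api.cc` that follows reads its data from a header generated by the `FILE(WRITE ...)`/`FILE(APPEND ...)` loop in `paddle/fluid/framework/CMakeLists.txt` shown earlier. Roughly what that generated `op_def.pbtxt` header looks like, restoring the `<std::string, std::string>` template arguments this rendering dropped; the `mul` entry body is invented for illustration:

```cpp
// Sketch of the generated paddle/fluid/framework/op_def.pbtxt header: one
// raw-string literal per operators/compat/*.pbtxt file, keyed by op name,
// plus the {"",""} sentinel the CMake loop appends last.
#include <iostream>
#include <string>
#include <unordered_map>

namespace {
const std::unordered_map<std::string, std::string> op_def_map = {
    {"mul", R"(type: "mul"
extra {
  attrs { name: "use_mkldnn" }
})"},
    {"", ""}};
}  // namespace

int main() {
  std::cout << op_def_map.at("mul") << "\n";  // raw pbtxt text, parsed lazily
  return 0;
}
```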
+ +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif +#include "paddle/fluid/framework/op_def_api.h" +#include +#include +#include +#ifdef _LINUX +#include +#include +#include +#endif +#include +#include +#include "glog/logging.h" +#include "io/fs.h" +#include "paddle/fluid/framework/op_def.pb.h" + +/* +// op_def.pbtxt +namespace { + const std::unordered_map<std::string, std::string> op_def_map = {...}; +} +*/ +#include "paddle/fluid/framework/op_def.pbtxt" //NOLINT + +namespace paddle { +namespace framework { + +const proto::OpDef& GetOpDef(const std::string& op_name) { + static std::unordered_map<std::string, proto::OpDef> ops_definition; + static std::mutex mtx; + if (ops_definition.find(op_name) == ops_definition.end()) { + std::lock_guard<std::mutex> lk(mtx); + if (ops_definition.find(op_name) == ops_definition.end()) { + proto::OpDef op_def; + if (op_def_map.find(op_name) == op_def_map.end()) { + LOG(WARNING) << op_name << ".pbtxt does not exist!"; + } else { + if (!::google::protobuf::TextFormat::ParseFromString( + op_def_map.at(op_name), &op_def)) { + LOG(WARNING) << "Failed to parse " << op_name; + } + } + if (op_def.type() != op_name) { + LOG(WARNING) << op_name << ".pbtxt has a wrong type: " << op_def.type(); + ops_definition.emplace(std::make_pair(op_name, proto::OpDef())); + } else { + ops_definition.emplace(std::make_pair(op_name, std::move(op_def))); + } + } + } + return ops_definition.at(op_name); +} + +bool HasOpDef(const std::string& op_name) { + return op_def_map.find(op_name) != op_def_map.end(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_def_api.h b/paddle/fluid/framework/op_def_api.h new file mode 100644 index 00000000000000..1ef2254d0da361 --- /dev/null +++ b/paddle/fluid/framework/op_def_api.h @@ -0,0 +1,25 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_def.pb.h" + +namespace paddle { +namespace framework { +const proto::OpDef& GetOpDef(const std::string& op_name); + +bool HasOpDef(const std::string& op_name); +} +} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 25d430df458255..20cffaa9590196 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1531,7 +1531,12 @@ Scope* OperatorWithKernel::PrepareData( // the rest iterations to save the elapsed time. // We do not support skipping PrepareData in while block, because the Op's // input may be changed by subsequent Ops, which may cause an error. - if (pre_scope_ == &scope && new_scope == nullptr) { + + // For inference, ops behind a conditional branch aren't supported well, + // so disable the prepare-data optimization conservatively.
+ bool force_prepare_data = HasAttr("inference_force_prepare_data") && + Attr<bool>("inference_force_prepare_data"); + if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) { need_prepare_data_ = false; } diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 7e48d0dc5f9620..c67653953f8a76 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -121,7 +121,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { SerializeToStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { +void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index e53e3d973c5246..3e4beb9498cf77 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -175,7 +175,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); -void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 6bee3d44b2edd7..c9dffe2d76a436 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils) cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2a3b6424d4a14e..4a42751b1c4d5b 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -15,8 +15,11 @@ #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" +DECLARE_bool(check_nan_inf); + namespace paddle { namespace imperative { @@ -175,6 +178,11 @@ static void PreparedOpRunImpl( func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs, attrs)); + if (FLAGS_check_nan_inf) { + framework::details::CheckOpHasNanOrInfInDygraph<VarType>( + op.Type(), outs, dev_ctx->GetPlace()); + } + /** * [ Why need handle complex gradient to real gradient?
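An aside on the stream refactor applied above and in the earlier `lod_tensor` hunks: widening `DeserializeFromStream` from `std::ifstream&` to `std::istream&` lets any input stream feed deserialization, not just a file. A small round-trip sketch against the signatures in this diff:

```cpp
#include <sstream>
#include "paddle/fluid/framework/lod_tensor.h"

// Serialize into an in-memory buffer and read it back; with the old
// std::ifstream& parameter this required a temporary file on disk.
void RoundTrip(const paddle::framework::LoDTensor& src,
               paddle::framework::LoDTensor* dst) {
  std::ostringstream os;
  paddle::framework::SerializeToStream(os, src);
  std::istringstream is(os.str());  // an istream, but not an ifstream
  paddle::framework::DeserializeFromStream(is, dst);
}
```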
] * diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index de5f9d75e9173a..1baf73ab3b95da 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/framework/op_registry.h" @@ -32,7 +33,17 @@ bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { for (const auto& name_pair : ins) { for (const auto& var_base : name_pair.second) { if (!var_base->OverridedStopGradient()) { - PassStopGradient(outs, var_base->OverridedStopGradient()); + for (const auto& pair : outs) { + for (const auto& var : pair.second) { + if (var) { + var->SetOverridedStopGradient(false); + SetForwardDataTypeOfGradVar(var); + VLOG(3) << "Set output: " << var->Name() + << "'s OverridedStopGradient as " + << var->OverridedStopGradient(); + } + } + } return true; } } @@ -78,28 +89,36 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, // process args,`input_vars` only collect `imperative::VarBase` if (!args.empty()) { for (auto ptr = args.begin(); ptr != args.end(); ptr++) { - try { - if (Py_None != ptr->ptr()) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + if (py::isinstance(*ptr)) { + try { auto a = ptr->cast>(); input_vars.push_back(a); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); } - } catch (py::cast_error& err) { - // Only collect Tensor type in 'args' and pass them to backward. Ignore - // other types of input temporarily. } } } // process kwargs, only collect `imperative::VarBase` if (!kwargs.empty()) { for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { - try { - if (Py_None != ptr->second.ptr()) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(*ptr->second)) { + try { auto a = ptr->second.cast>(); input_vars.push_back(a); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); } - } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } } @@ -110,33 +129,35 @@ py::object PyLayerApply(const platform::Place& place, const py::handle& cls, PyList_Check(result_forward.ptr())) { auto tuple_result = result_forward.cast(); for (size_t i = 0; i < tuple_result.size(); i++) { - if (Py_None != tuple_result[i].ptr()) { + // Only collect Tensor type of output and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(tuple_result[i])) { try { auto temp_out = tuple_result[i].cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. 
+ PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function returns an invalid argument, the " + "`%s` type argument cannot be cast into `Tensor`.", + tuple_result[i].ptr()->ob_type->tp_name)); } - } else { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } } else { - if (Py_None != result_forward.ptr()) { + // Only collect Tensor type of output and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance<imperative::VarBase>(result_forward)) { try { auto temp_out = result_forward.cast<std::shared_ptr<imperative::VarBase>>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function returns an invalid argument, the `%s` " + "type argument cannot be cast into `Tensor`.", + result_forward.ptr()->ob_type->tp_name)); } - } else { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } if (output_vars.size() == 0) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2733d21b6cba3a..e628216a5ed87b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -270,7 +270,46 @@ bool AnalysisPredictor::CreateExecutor() { executor_.reset(new paddle::framework::NaiveExecutor(place_)); return true; } + +static bool IsPrepareDataOptTargetOp(framework::OpDesc *op) { + // Here is the bad case for the prepare-data optimization: assume an op + // sits behind a conditional_block. If conditional_block chooses branch 1, + // the op needs to call prepare data; otherwise it does not. At runtime, + // if the predictor first chooses branch 2, the optimization takes effect, + // and a later run that chooses branch 1 fails, because the op has lost + // its chance to prepare data.
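A toy model of that bad case may help (the target-op list continues right below). `need_prepare` mirrors `need_prepare_data_`, which the executor switches off after the first run whether or not the op's branch was taken; `force` mirrors the new `inference_force_prepare_data` attribute. The struct is hypothetical, for illustration only:

```cpp
#include <cassert>

// Hypothetical op that caches its transferred input after the first run.
struct ToyOp {
  bool need_prepare = true;   // need_prepare_data_ in OperatorWithKernel
  bool force = false;         // inference_force_prepare_data attribute
  bool data_ready = false;
  void RunPrepare(bool branch_taken) {
    if (need_prepare || force) data_ready = branch_taken;  // PrepareData
    need_prepare = false;  // optimization: skip from the second run on
  }
};

int main() {
  ToyOp buggy;
  buggy.RunPrepare(false);   // run 1: other branch taken, nothing prepared
  buggy.RunPrepare(true);    // run 2: prepare skipped -> stale input
  assert(!buggy.data_ready); // the failure described in the comment above

  ToyOp fixed;
  fixed.force = true;        // what DisablePrepareDataOpt sets downstream
  fixed.RunPrepare(false);
  fixed.RunPrepare(true);
  assert(fixed.data_ready);  // always re-prepares, so run 2 is safe
  return 0;
}
```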
+ std::vector op_type = {"conditional_block_infer", + "select_input"}; + for (const auto &type : op_type) { + if (op->Type() == type) { + return true; + } + } + return false; +} + +static void DisablePrepareDataOpt( + std::shared_ptr inference_program, int block, + bool pre_disable_opt) { + bool disable_opt = false; + auto &infer_block = inference_program->Block(block); + for (auto *op : infer_block.AllOps()) { + if (disable_opt || pre_disable_opt) { + op->SetAttr("inference_force_prepare_data", true); + } + if (op->HasAttr("sub_block")) { + int blockID = op->GetBlockAttrId("sub_block"); + DisablePrepareDataOpt(inference_program, blockID, + disable_opt || pre_disable_opt); + } + // disable prepare data if unfriendly op is found + disable_opt = IsPrepareDataOptTargetOp(op); + } +} + bool AnalysisPredictor::PrepareExecutor() { + DisablePrepareDataOpt(inference_program_, 0, false); + executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_); @@ -1198,6 +1237,8 @@ USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); USE_TRT_CONVERTER(reshape); +USE_TRT_CONVERTER(reduce_sum); +USE_TRT_CONVERTER(gather_nd); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 99328e60768913..2e4a175566a7a1 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -13,6 +13,8 @@ nv_library(tensorrt_converter multiclass_nms_op.cc nearest_interp_op.cc reshape_op.cc + reduce_op.cc + gather_nd_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 61199724bcfe30..6bbda6bb29aadb 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, static_cast(bias_data), bias_size}; - auto* layer = fadd_layer(const_cast(X), n_output, n_input, - nv_ksize, weight, bias); - PADDLE_ENFORCE_NOT_NULL(layer, - platform::errors::Fatal("TensorRT create conv2d" - " layer error.")); + // In conv2d_transpose and depthwise_conv2d_transpose, + // output channels = filter_dims[1] * groups + auto* layer = (op_desc.Type() == "conv2d_transpose" || + op_desc.Type() == "depthwise_conv2d_transpose") + ? 
fadd_layer(const_cast<nvinfer1::ITensor*>(X),
+                                   n_input * groups, nv_ksize, weight, bias)
+                      : fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output,
+                                   nv_ksize, weight, bias);
+
+    PADDLE_ENFORCE_NOT_NULL(
+        layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose"
+                                       " layer failed."));
     layer->setStride(nv_strides);
     layer->setPadding(nv_paddings);
     layer->setNbGroups(groups);
@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter {
     ConvertConv2d(
         engine_, op, scope, test_mode,
         [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
-            int n_input,                             /* Conv input maps */
             nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
             TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
           auto* layer =
@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter {
     ConvertConv2d(
         engine_, op, scope, test_mode,
         [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
-            int n_input,                             /* Deconv output maps */
             nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
             TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
           auto* layer =
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 66a682db07b911..04c51202f022f6 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -40,10 +40,19 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     auto word_emb_name = op_desc.Input("WordEmbedding").front();
     auto pos_emb_name = op_desc.Input("PosEmbedding").front();
     auto sent_emb_name = op_desc.Input("SentEmbedding").front();
-    std::vector<std::string> id_names = {word_id_name, pos_id_name,
-                                         sent_id_name};
-    std::vector<std::string> emb_names = {word_emb_name, pos_emb_name,
-                                          sent_emb_name};
+
+    std::vector<std::string> id_names;
+    std::vector<std::string> emb_names;
+
+    if (engine_->use_oss()) {
+      id_names =
+          std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
+      emb_names =
+          std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};
+    } else {
+      id_names = op_desc.Input("Ids");
+      emb_names = op_desc.Input("Embs");
+    }
 
     int input_num = id_names.size();
diff --git a/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
new file mode 100644
index 00000000000000..489fc987dfec2a
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class GatherNdOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a paddle gather_nd op to tensorrt gather_nd plugin";
+    framework::OpDesc op_desc(op, nullptr);
+
+    // Declare inputs
+    std::vector<nvinfer1::ITensor*> inputs;
+    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto* index = engine_->GetITensor(op_desc.Input("Index")[0]);
+    inputs.emplace_back(input);
+    inputs.emplace_back(index);
+
+    nvinfer1::ILayer* layer = nullptr;
+    bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+    plugin::GatherNdPluginDynamic* plugin =
+        new plugin::GatherNdPluginDynamic(with_fp16);
+    layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
+
+    std::string layer_name = "gather_nd (Output: ";
+    auto output_name = op_desc.Output("Out")[0];
+    layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    layer_name += output_name;
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+    layer->setName((layer_name + ")").c_str());
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index 0b97b5d87a3d50..de5d3110e18903 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -46,13 +46,6 @@ class LayerNormOpConverter : public OpConverter {
   auto* Bias_t = Bias_v->GetMutable<framework::LoDTensor>();
   auto* Scale_t = Scale_v->GetMutable<framework::LoDTensor>();
 
-  int input_num = 1;
-  for (int i = 0; i < X->getDimensions().nbDims; i++) {
-    input_num *= X->getDimensions().d[i];
-  }
-  std::vector<int64_t> mean_shape{input_num};
-  std::vector<int64_t> variance_shape{input_num};
-
   std::unique_ptr<framework::LoDTensor> bias_tensor(
       new framework::LoDTensor());
   std::unique_ptr<framework::LoDTensor> scale_tensor(
@@ -68,10 +61,33 @@ class LayerNormOpConverter : public OpConverter {
   auto* bias_data = bias_tensor->mutable_data<float>(platform::CPUPlace());
   auto* scale_data = scale_tensor->mutable_data<float>(platform::CPUPlace());
 
-  plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
-      bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(),
-      begin_norm_axis, eps, mean_shape, variance_shape);
-  nvinfer1::IPluginLayer* layernorm_layer = engine_->AddPlugin(&X, 1, plugin);
+  nvinfer1::ILayer* layernorm_layer = nullptr;
+  if (engine_->with_dynamic_shape()) {
+    int input_num = 1;
+    for (int i = begin_norm_axis; i < X->getDimensions().nbDims; i++) {
+      input_num *= X->getDimensions().d[i];
+    }
+    std::vector<int64_t> mean_shape{input_num};
+    std::vector<int64_t> variance_shape{input_num};
+    plugin::LayerNormPluginDynamic* plugin =
+        new plugin::LayerNormPluginDynamic(bias_data, bias_tensor->numel(),
+                                           scale_data, scale_tensor->numel(),
+                                           begin_norm_axis, eps, mean_shape,
+                                           variance_shape);
+    layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin);
+  } else {
+    int input_num = 1;
+    for (int i = begin_norm_axis - 1; i < X->getDimensions().nbDims; i++) {
+      input_num *= X->getDimensions().d[i];
+    }
+    std::vector<int64_t> mean_shape{input_num};
+    std::vector<int64_t> variance_shape{input_num};
+    plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
+        bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(),
+        begin_norm_axis, eps, mean_shape, variance_shape);
+    layernorm_layer = engine_->AddPlugin(
+        &X, 1, reinterpret_cast<plugin::PluginTensorRT*>(plugin));
+  }
 
   auto output_name = op_desc.Output("Y").front();
   engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor));
diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
new file mode 100644
index 00000000000000..66d2680fe9969c
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <NvInfer.h>
+#include <sys/types.h>
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class ReduceSumOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer";
+    framework::OpDesc op_desc(op, nullptr);
+
+    auto* x = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::Dims input_shape = x->getDimensions();
+    int input_dims = input_shape.nbDims;
+
+    bool keep_dim = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_dim"));
+    std::vector<int32_t> dim =
+        BOOST_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("dim"));
+    bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all"));
+
+    // Now we only support dynamic_shape mode.
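The lambda a few lines below packs the reduce dims into the uint32 axis bitmask that TensorRT's IReduceLayer expects. A minimal standalone sketch of the same conversion, with worked values (illustrative only, not part of the patch):

    #include <cstdint>
    #include <vector>

    // Negative Paddle dims count from the end, so -1 on a 4-D input is axis 3.
    uint32_t ToReduceMask(const std::vector<int32_t>& dims, int32_t input_dims) {
      uint32_t mask = 0;
      for (int32_t d : dims) {
        mask |= 1u << (d < 0 ? d + input_dims : d);
      }
      return mask;
    }
    // ToReduceMask({-1}, 4) == 0x8; ToReduceMask({0, 2}, 4) == 0x5.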
+ nvinfer1::IReduceLayer* layer = nullptr; + if (reduce_all) { + uint32_t reduce_dim = 0; + for (int i = 0; i < input_dims; ++i) { + reduce_dim |= 1 << i; + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, + nvinfer1::ReduceOperation::kSUM, reduce_dim, + keep_dim); + } else { + auto CvtToBitMask = [&](const std::vector& dims) -> uint32_t { + uint32_t res = 0; + for (auto x : dims) { + if (x < 0) { + res |= 1 << (x + input_dims); + } else { + res |= 1 << x; + } + } + return res; + }; + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, + nvinfer1::ReduceOperation::kSUM, + CvtToBitMask(dim), keep_dim); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc index 3d8c72728c6671..489603e20cda2f 100644 --- a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc @@ -34,7 +34,7 @@ class ReshapeOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - const std::vector& shape = + std::vector shape = BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); int nbDims_num = shape.size(); nvinfer1::Dims reshape_dim; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 85c466e4644e01..0dc08a482733a3 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/op_teller.h" + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -122,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "gather_nd", "yolo_box", "roi_align", "affine_channel", "nearest_interp", "anchor_generator", + "reduce_sum", }; }; @@ -324,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "gather_nd") { + auto* block = desc.Block(); + auto x_var_name = desc.Input("X")[0]; + auto index_var_name = desc.Input("Index")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + auto* index_var_desc = block->FindVar(index_var_name); + + // The index input must be int32 datatype. 
+ if (index_var_desc->GetDataType() != + paddle::framework::proto::VarType_Type::VarType_Type_INT32) { + VLOG(3) << "gather_nd op Index input data type must be int32"; + return false; + } + + const auto index_shape = index_var_desc->GetShape(); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() != index_shape.size()) { + VLOG(3) << "gather_nd op Index input dims size [" << index_shape.size() + << " ] not equal to x dims size [" << x_shape.size() << "]"; + return false; + } + if (!with_dynamic_shape) return false; + } + if (op_type == "yolo_box") { if (with_dynamic_shape) return false; bool has_attrs = @@ -676,7 +703,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; // Paddle-TRT does not support the input tensors: Shape and ShapeTensor } else if (desc.Input("Shape").size() >= 1 || - desc.Input("ShapeTensor").size() >= 1) { + desc.Input("ShapeTensor").size() >= 1 || with_dynamic_shape) { return false; } else { std::vector shape = @@ -684,6 +711,21 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (shape.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } + + if (op_type == "reduce_sum") { + if (!with_dynamic_shape) { + VLOG(3) << "the reduce_sum does not support static shape yet"; + return false; + } + + if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") && + desc.HasAttr("reduce_all"))) { + VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or " + "reduce_all)"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 1804e6c5571d3a..26125d21ad7d1a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -8,6 +8,7 @@ nv_library(tensorrt_plugin anchor_generator_op_plugin.cu yolo_box_op_plugin.cu roi_align_op_plugin.cu + gather_nd_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu new file mode 100644 index 00000000000000..5f4ac054c95b34 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu @@ -0,0 +1,229 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
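Before the kernel itself, it may help to see the gather offset arithmetic in isolation. A host-side sketch (illustrative; the helper name is ours, not the plugin's): each work item is split into an index-tuple number and an offset within the gathered slice, and the tuple is folded right-to-left into a flat row-major offset:

    #include <cstdint>
    #include <vector>

    // Mirrors the loop in GatherNdCUDAKernel below: temp accumulates the
    // row-major stride, starting at slice_size for the innermost indexed dim.
    int64_t FlatGatherOffset(const std::vector<int32_t>& input_dims,
                             const std::vector<int32_t>& index_tuple,
                             int32_t slice_size) {
      int64_t gather_i = 0;
      int64_t temp = slice_size;
      for (int j = static_cast<int>(index_tuple.size()) - 1; j >= 0; --j) {
        gather_i += index_tuple[j] * temp;
        temp *= input_dims[j];
      }
      return gather_i;  // element offset = gather_i + offset inside the slice
    }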
+
+#include <cuda_fp16.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <numeric>
+#include <vector>
+
+#include "NvInferRuntimeCommon.h"
+#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+
+template <typename T, typename IndexT = int>
+__global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims,
+                                   const IndexT* indices, T* output,
+                                   int32_t remain_size, int32_t slice_size,
+                                   int32_t end_size) {
+  CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    IndexT gather_i = 0;
+    int32_t temp = slice_size;
+    for (int32_t j = end_size - 1; j >= 0; --j) {
+      auto index_value = indices[indices_i * end_size + j];
+      PADDLE_ENFORCE(
+          index_value >= 0 && index_value < input_dims[j],
+          "The index is out of bounds, "
+          "please check whether the dimensions of index and "
+          "input meet the requirements. It should "
+          "be less than [%d] and greater than or equal to 0, but received [%d]",
+          input_dims[j], index_value);
+      gather_i += (index_value * temp);
+      temp *= input_dims[j];
+    }
+    IndexT input_i = gather_i + slice_i;
+    *(output + i) = *(input + input_i);
+  }
+}
+
+int GatherNdPluginDynamic::initialize() { return 0; }
+
+size_t GatherNdPluginDynamic::getSerializationSize() const {
+  return SerializedSize(with_fp16_);
+}
+
+void GatherNdPluginDynamic::serialize(void* buffer) const {
+  SerializeValue(&buffer, with_fp16_);
+}
+
+nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions(
+    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder) {
+  PADDLE_ENFORCE_EQ(
+      nb_inputs, 2,
+      platform::errors::InvalidArgument(
+          "The gather_nd plugin should have 2 inputs, but got %d.", nb_inputs));
+  PADDLE_ENFORCE_EQ(output_index, 0,
+                    platform::errors::InvalidArgument(
+                        "When GetOutputDimensions in gather_nd "
+                        "plugin, the output_index should be 0."));
+
+  nvinfer1::DimsExprs x_dims = inputs[0];
+  nvinfer1::DimsExprs index_dims = inputs[1];
+
+  int32_t x_dims_size = x_dims.nbDims;
+  int32_t index_dims_size = index_dims.nbDims;
+
+  // TODO(wilber): The result dims should be Index.shape[:-1] +
+  // X.shape[Index.shape[-1]:], but a TRT DimsExprs is a symbolic expression
+  // whose actual value we can't read here. So we only support one scenario:
+  // input_dims.size == index_dims.size.
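For intuition, the general rule mentioned in the TODO, as a host-side sketch over concrete shapes (illustrative; at build time only symbolic DimsExprs are available):

    #include <cstdint>
    #include <vector>

    // Out.shape = Index.shape[:-1] + X.shape[Index.shape[-1]:]
    std::vector<int64_t> GatherNdOutShape(const std::vector<int64_t>& x,
                                          const std::vector<int64_t>& index) {
      std::vector<int64_t> out(index.begin(), index.end() - 1);
      out.insert(out.end(), x.begin() + index.back(), x.end());
      return out;
    }
    // X = [4, 5, 6], Index = [2, 2]    -> Out = [2, 6]
    // X = [2, 3, 4], Index = [2, 3, 2] -> Out = [2, 3, 4]; with equal ranks the
    // code below can reuse x_dims and overwrite its leading dims with index dims.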
+ nvinfer1::DimsExprs ret(x_dims); + for (int i = 0; i < index_dims_size - 1; ++i) { + ret.d[i] = index_dims.d[i]; + } + + return ret; +} + +bool GatherNdPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of gather_nd plugin should not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + (in_out && pos < (nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } else if (pos == 1) { + return in.type == nvinfer1::DataType::kINT32 && + in.format == nvinfer1::TensorFormat::kLINEAR; + } else if (pos == 2) { + return in.type == in_out[0].type && + in.format == nvinfer1::TensorFormat::kLINEAR; + } + + return true; +} + +nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + return input_types[0]; +} + +int GatherNdPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + auto input_dims = input_desc[0].dims; + auto index_dims = input_desc[1].dims; + auto input_dims_size = input_dims.nbDims; + auto index_dims_size = index_dims.nbDims; + + std::vector input_shape, index_shape, out_shape; + for (int i = 0; i < input_dims.nbDims; i++) + input_shape.push_back(input_dims.d[i]); + for (int i = 0; i < index_dims.nbDims; i++) + index_shape.push_back(index_dims.d[i]); + // The out_shape is + // Index.shape[:-1] + X.shape[Index.shape[-1]:] + for (int i = 0; i < index_dims_size - 1; ++i) { + out_shape.emplace_back(index_shape[i]); + } + for (int i = index_shape[index_dims_size - 1]; i < input_dims_size; ++i) { + out_shape.emplace_back(input_shape[i]); + } + + // final dim + int end_size = index_shape[index_dims_size - 1]; + // remain dim + std::vector remain_ddim(index_shape.begin(), index_shape.end() - 1); + int remain_numel = std::accumulate(remain_ddim.begin(), remain_ddim.end(), 1, + std::multiplies()); + // slice size + int slice_size = 1; + for (int i = end_size; i < input_dims_size; ++i) { + slice_size *= input_shape[i]; + } + + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. 
gather_nd-->fp32"; + + const float* p_input = static_cast(inputs[0]); + const int32_t* p_index = static_cast(inputs[1]); + float* p_output = static_cast(outputs[0]); + + if (input_dims_data_ == nullptr) { + cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int)); + } + cudaMemcpyAsync(input_dims_data_, input_shape.data(), + sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice, + stream); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + GatherNdCUDAKernel<<>>( + p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size, + end_size); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp16"; + + const half* p_input = static_cast(inputs[0]); + const int32_t* p_index = static_cast(inputs[1]); + half* p_output = static_cast(outputs[0]); + + if (input_dims_data_ == nullptr) { + cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int)); + } + cudaMemcpyAsync(input_dims_data_, input_shape.data(), + sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice, + stream); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + GatherNdCUDAKernel<<>>( + p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size, + end_size); + } + + return cudaGetLastError() != cudaSuccess; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h new file mode 100644 index 00000000000000..0a242238c81fb3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h @@ -0,0 +1,132 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
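Recapping the size decomposition that the plugin's enqueue derives from the two shapes, as a small host-side helper (illustrative only, not part of the patch):

    #include <cstdint>
    #include <functional>
    #include <numeric>
    #include <vector>

    struct GatherNdSizes {
      int32_t end_size;  // length of each index tuple (last dim of Index)
      int32_t remain;    // number of index tuples
      int32_t slice;     // elements copied per tuple
    };

    GatherNdSizes ComputeSizes(const std::vector<int32_t>& x,
                               const std::vector<int32_t>& index) {
      GatherNdSizes s;
      s.end_size = index.back();
      s.remain = std::accumulate(index.begin(), index.end() - 1, 1,
                                 std::multiplies<int32_t>());
      s.slice = std::accumulate(x.begin() + s.end_size, x.end(), 1,
                                std::multiplies<int32_t>());
      return s;
    }
    // x = {3, 4, 5}, index = {2, 2} -> end_size 2, remain 2, slice 5,
    // so the kernel covers remain * slice = 10 elements.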
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class GatherNdPluginDynamic : public DynamicPluginTensorRT { + public: + explicit GatherNdPluginDynamic(bool with_fp16) { with_fp16_ = with_fp16; } + + GatherNdPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + } + + nvinfer1::IPluginV2DynamicExt* clone() const override { + return new GatherNdPluginDynamic(with_fp16_); + } + + const char* getPluginType() const override { return "gather_nd_plugin"; } + int getNbOutputs() const override { return 1; } + int initialize() override; + + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + void destroy() override { + if (input_dims_data_) { + cudaFree(input_dims_data_); + } + delete this; + } + + private: + int32_t* input_dims_data_{nullptr}; +}; + +class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + GatherNdPluginDynamicCreator() {} + const char* getPluginName() const override { return "gather_nd_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new GatherNdPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(GatherNdPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 8af036a0e86709..d67820a6f0af4f 100644 --- 
a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -57,8 +57,18 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, input_shape.push_back(input_dims.d[i]); } const auto input_ddim = framework::make_ddim(input_shape); - auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis - 1); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis); int feature_size = static_cast(matrix_dim[1]); + PADDLE_ENFORCE_EQ(feature_size, scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, bias_.size(), + platform::errors::InvalidArgument( + "bias's size should be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, bias_.size())); scale_t.Resize(framework::make_ddim({feature_size})); bias_t.Resize(framework::make_ddim({feature_size})); @@ -82,6 +92,163 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } +nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputDims, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) { + return inputDims[0]; +} + +bool LayerNormPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of layernorm plugin shoule not be nullptr.")); + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + const nvinfer1::PluginTensorDesc &in = in_out[pos]; + if (pos == 0) { + // TODO(Shangzhizhou) FP16 support + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The LayerNormPlugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int LayerNormPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) { + const auto &input_dims = input_desc[0].dims; + int begin_norm_axis = begin_norm_axis_; + float eps = eps_; + + std::vector input_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + const auto input_ddim = framework::make_ddim(input_shape); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis); + int feature_size = static_cast(matrix_dim[1]); + PADDLE_ENFORCE_EQ(feature_size, scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, bias_.size(), + platform::errors::InvalidArgument( + "bias's size should 
be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, bias_.size())); + int device_id; + cudaGetDevice(&device_id); + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32"; + const float *input = reinterpret_cast(inputs[0]); + float *output = static_cast(outputs[0]); + scale_t.Resize(framework::make_ddim({feature_size})); + bias_t.Resize(framework::make_ddim({feature_size})); + mean_t.Resize(framework::make_ddim(mean_shape_)); + variance_t.Resize(framework::make_ddim(variance_shape_)); + + float *scale_d = + scale_t.mutable_data(platform::CUDAPlace(device_id)); + float *bias_d = bias_t.mutable_data(platform::CUDAPlace(device_id)); + float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); + float *variance_d = + variance_t.mutable_data(platform::CUDAPlace(device_id)); + + cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * feature_size, + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, + cudaMemcpyHostToDevice, stream); + + paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, + variance_d, begin_norm_axis, eps); + } else if (input_type == nvinfer1::DataType::kHALF) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = static_cast(outputs[0]); + size_t mean_shape_product = 1; + for (auto s : mean_shape_) { + mean_shape_product *= s; + } + size_t variance_shape_product = 1; + for (auto s : variance_shape_) { + variance_shape_product *= s; + } + if (!scale_gpu_half_d_) { + cudaMalloc(&scale_gpu_half_d_, feature_size * sizeof(half)); + } + if (!bias_gpu_half_d_) { + cudaMalloc(&bias_gpu_half_d_, feature_size * sizeof(half)); + } + if (!mean_gpu_half_d_) { + cudaMalloc(&mean_gpu_half_d_, mean_shape_product * sizeof(half)); + } + if (!variance_gpu_half_d_) { + cudaMalloc(&variance_gpu_half_d_, variance_shape_product * sizeof(half)); + } + + half *scale_cpu_half = + static_cast(malloc(feature_size * sizeof(half))); + half *bias_cpu_half = + static_cast(malloc(feature_size * sizeof(half))); + PADDLE_ENFORCE_EQ( + scale_cpu_half && bias_cpu_half, true, + platform::errors::Unavailable("Out of memory, malloc size %d.", + feature_size * sizeof(half))); + + for (int i = 0; i < feature_size; i++) { + scale_cpu_half[i] = static_cast(scale_[i]); + bias_cpu_half[i] = static_cast(bias_[i]); + } + cudaMemcpyAsync(scale_gpu_half_d_, scale_cpu_half, + sizeof(half) * feature_size, cudaMemcpyHostToDevice, + stream); + cudaMemcpyAsync(bias_gpu_half_d_, bias_cpu_half, + sizeof(half) * feature_size, cudaMemcpyHostToDevice, + stream); + free(scale_cpu_half); + free(bias_cpu_half); + + paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, input, input_shape, bias_gpu_half_d_, scale_gpu_half_d_, + output, mean_gpu_half_d_, variance_gpu_half_d_, begin_norm_axis, + eps); +#else + PADDLE_THROW(platform::errors::Fatal( + "The layer_norm tensorRT plugin should be " + "complied with CUDA version >= 10.0 when running with fp16. 
" + "Please recomplie it or try to use fp32 by set " + "config.SetTRTDynamicShapeInfo(min_input_shape, " + "max_input_shape, opt_input_shape, true")); +#endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "The LayerNorm TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h index 050ef3b77d3157..1a6125b0e16ffd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h @@ -50,7 +50,7 @@ class LayerNormPlugin : public PluginTensorRT { // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void *buffer) override { + void serialize(void* buffer) override { SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, bias_); @@ -62,7 +62,7 @@ class LayerNormPlugin : public PluginTensorRT { } public: - LayerNormPlugin(const float *bias, const int bias_num, const float *scale, + LayerNormPlugin(const float* bias, const int bias_num, const float* scale, const int scale_num, int begin_norm_axis, float eps, std::vector mean_shape, std::vector variance_shape) @@ -78,7 +78,7 @@ class LayerNormPlugin : public PluginTensorRT { // It was used for tensorrt deserialization. // It should not be called by users. - LayerNormPlugin(void const *serialData, size_t serialLength) { + LayerNormPlugin(void const* serialData, size_t serialLength) { deserializeBase(serialData, serialLength); DeserializeValue(&serialData, &serialLength, &bias_); DeserializeValue(&serialData, &serialLength, &scale_); @@ -90,20 +90,180 @@ class LayerNormPlugin : public PluginTensorRT { ~LayerNormPlugin() {} int initialize() override; - LayerNormPlugin *clone() const override { + LayerNormPlugin* clone() const override { return new LayerNormPlugin(bias_.data(), bias_.size(), scale_.data(), scale_.size(), begin_norm_axis_, eps_, mean_shape_, variance_shape_); } - const char *getPluginType() const override { return "layer_norm_plugin"; } + const char* getPluginType() const override { return "layer_norm_plugin"; } int getNbOutputs() const override { return 1; } - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; - int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; + int enqueue(int batchSize, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; }; +class LayerNormPluginDynamic : public DynamicPluginTensorRT { + public: + LayerNormPluginDynamic(const float* bias, const int bias_num, + const float* scale, const int scale_num, + int begin_norm_axis, float eps, + std::vector mean_shape, + std::vector variance_shape) + : begin_norm_axis_(begin_norm_axis), + eps_(eps), + mean_shape_(mean_shape), + variance_shape_(variance_shape), + scale_gpu_half_d_(nullptr), + bias_gpu_half_d_(nullptr), + mean_gpu_half_d_(nullptr), + variance_gpu_half_d_(nullptr) { + bias_.resize(bias_num); + scale_.resize(scale_num); + std::copy(bias, bias + bias_num, bias_.data()); + std::copy(scale, scale + scale_num, scale_.data()); + } + + LayerNormPluginDynamic(void const* 
serialData, size_t serialLength) + : scale_gpu_half_d_(nullptr), + bias_gpu_half_d_(nullptr), + mean_gpu_half_d_(nullptr), + variance_gpu_half_d_(nullptr) { + DeserializeValue(&serialData, &serialLength, &bias_); + DeserializeValue(&serialData, &serialLength, &scale_); + DeserializeValue(&serialData, &serialLength, &begin_norm_axis_); + DeserializeValue(&serialData, &serialLength, &eps_); + DeserializeValue(&serialData, &serialLength, &mean_shape_); + DeserializeValue(&serialData, &serialLength, &variance_shape_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { + return new LayerNormPluginDynamic(bias_.data(), bias_.size(), scale_.data(), + scale_.size(), begin_norm_axis_, eps_, + mean_shape_, variance_shape_); + } + + const char* getPluginType() const override { return "layernorm_plugin"; } + int getNbOutputs() const override { return 1; } + int initialize() override { return 0; } + + size_t getSerializationSize() const override { + return SerializedSize(bias_) + SerializedSize(scale_) + + SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + + SerializedSize(mean_shape_) + SerializedSize(variance_shape_); + } + + void serialize(void* buffer) const override { + SerializeValue(&buffer, bias_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, begin_norm_axis_); + SerializeValue(&buffer, eps_); + SerializeValue(&buffer, mean_shape_); + SerializeValue(&buffer, variance_shape_); + } + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + ~LayerNormPluginDynamic() { + if (scale_gpu_half_d_) { + cudaFree(scale_gpu_half_d_); + } + if (bias_gpu_half_d_) { + cudaFree(bias_gpu_half_d_); + } + if (mean_gpu_half_d_) { + cudaFree(mean_gpu_half_d_); + } + if (variance_gpu_half_d_) { + cudaFree(variance_gpu_half_d_); + } + } + + void destroy() override { delete this; } + + private: + std::vector bias_; + std::vector scale_; + framework::Tensor scale_t; + framework::Tensor bias_t; + framework::Tensor mean_t; + framework::Tensor variance_t; + int begin_norm_axis_; + float eps_; + std::vector mean_shape_; + std::vector variance_shape_; + half* scale_gpu_half_d_; + half* bias_gpu_half_d_; + half* mean_gpu_half_d_; + half* variance_gpu_half_d_; +}; + +class LayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + LayerNormPluginDynamicCreator() {} + const char* getPluginName() const override { return "layernorm_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const 
nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new LayerNormPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(LayerNormPluginDynamicCreator); + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 07208d016a7908..f0eb0d1fa675b7 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -325,11 +325,10 @@ inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_te # densebox set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox") download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") -#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc -# EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} -# ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt -# --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) -#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2) +inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt + --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc new file mode 100644 index 00000000000000..f157f6b0b82ea9 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +DEFINE_string(infer_shape, "", "data shape file"); +DEFINE_int32(sample, 20, "number of sample"); + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line, const std::string &shape_line) { + VLOG(3) << "process a line"; + + Record record; + std::vector data_strs; + split(line, ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(shape_line, ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + return record; +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + // cfg->SwitchIrDebug(); // Enable to have graphs dumped + cfg->SwitchSpecifyInputNames(false); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +void SetInput(std::vector> *inputs, + const std::string &line, const std::string &shape_line) { + auto record = ProcessALine(line, shape_line); + + PaddleTensor input; + input.shape = record.shape; + input.dtype = PaddleDType::FLOAT32; + size_t input_size = record.data.size() * sizeof(float); + input.data.Resize(input_size); + memcpy(input.data.data(), record.data.data(), input_size); + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +#ifdef PADDLE_WITH_MKLDNN +int GetNumCachedObjects(void) { + auto &pool = platform::DeviceContextPool::Instance(); + platform::CPUPlace place; + auto onednn_dev_ctx = + dynamic_cast(pool.Get(place)); + return onednn_dev_ctx->GetCachedObjectsNumber(); +} + +void validate_cache_onednn(int cache_capacity = 1) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.EnableMKLDNN(); + cfg.SetMkldnnCacheCapacity(cache_capacity); + + auto predictor = CreatePaddlePredictor(cfg); + std::vector> ref_outputs; + std::vector> input_slots_all; + + std::ifstream file(FLAGS_infer_data); + std::ifstream infer_file(FLAGS_infer_shape); + std::vector lines; + std::vector shape_lines; + + // Let's work with 4 samples + auto num_samples = 4; + ref_outputs.resize(num_samples); + lines.resize(num_samples); + shape_lines.resize(num_samples); + + // Let's remember number of cached objects before + // execution and after every single execution + std::vector cache_filling; + cache_filling.push_back(GetNumCachedObjects()); + + // compute sequentially prediction + for (int i = 0; i < num_samples; ++i) { + std::getline(file, lines[i]); + std::getline(infer_file, shape_lines[i]); + SetInput(&input_slots_all, lines[i], shape_lines[i]); + predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size); + // record number of cached objects + cache_filling.push_back(GetNumCachedObjects()); + } + + file.close(); + infer_file.close(); + + predictor.reset(nullptr); + cache_filling.push_back(GetNumCachedObjects()); + + // Compare results + // First and last value should be equal e.g. 
before using cache (empty) and
+  // after releasing executor
+  PADDLE_ENFORCE_EQ(
+      cache_filling[0], cache_filling[cache_filling.size() - 1],
+      platform::errors::Fatal("Cache size before execution and after "
+                              "releasing Executor do not match"));
+
+  // Check that the cache stops growing once the cache capacity is exceeded
+  if (cache_capacity != 0) {
+    for (int i = cache_capacity + 1; i < num_samples + 1; ++i) {
+      PADDLE_ENFORCE_EQ(
+          cache_filling[cache_capacity], cache_filling[i],
+          platform::errors::Fatal("Cache capacity should not increase "
+                                  "after full capacity is used"));
+    }
+  }
+}
+
+TEST(Analyzer_detect, validate_cache_onednn) {
+  validate_cache_onednn(2 /* cache_capacity */);
+}
+#endif
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
index e911c94208711e..adb6aa4d75344d 100644
--- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
@@ -167,7 +167,7 @@ def run_convert():
                 os.path.getsize(output_file) == FULL_SIZE_BYTES):
             if os.path.exists(output_file):
                 sys.stderr.write(
-                    "\n\nThe existing binary file is broken. Start to generate new one...\n\n".
+                    "\n\nThe existing binary file[{}] is broken. Start to generate new one...\n\n".
                     format(output_file))
                 os.remove(output_file)
             if retry < try_limit:
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
index 53b91f540cefe4..26280cd2bd1d32 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc
@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
 
     found_inf->mutable_data<bool>(ctx.GetPlace());
 
-    bool found_inf_data = false;
-
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    // step1: inverse scale(RealDiv)
+    // step1: inverse scale
     Tensor const_tensor;
     const_tensor.mutable_data<T>({1}, ctx.GetPlace());
     FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
@@ -66,7 +64,6 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
     // NOTE(zhiqiu):
     Tensor tmp;
     tmp.mutable_data<float>({8}, ctx.GetPlace());
-
     // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
     // tmp is only placeholder.
     const auto& runner_float_status =
@@ -81,39 +78,26 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
         {{"axes", std::vector<int>{0}}, {"keep_dims", true}});
     runner_reduce_sum.Run(stream);
 
-    std::vector<float> sum_vec;
-    TensorToVector(
-        sum, ctx.template device_context<paddle::platform::NPUDeviceContext>(),
-        &sum_vec);
-    found_inf_data = (sum_vec[0] > 1);
-
-    VLOG(4) << "found_inf_data:" << found_inf_data;
-
+    const auto& runner_greater =
+        NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {});
+    runner_greater.Run(stream);
+
+    // NOTE(zhiqiu): The normal logic is:
+    //   out = in,         if found_inf = true
+    //   out = in / scale, if found_inf = false
+    // However, on NPU, to avoid a stream sync, we do not copy the found_inf
+    // data back to the CPU to decide whether to unscale or not.
+    // Instead, we run the Mul whether found_inf is set or not.
+    // In fact, only a few steps contain nan/inf during training.
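A CPU reference of the semantics described in the note, for orientation (a sketch, not part of the patch): report found_inf when any value is non-finite, and unscale unconditionally, as the NPU path now does; the GradScaler then discards the step when found_inf is set.

    #include <cmath>
    #include <vector>

    bool CheckFiniteAndUnscaleRef(const std::vector<float>& x, float scale,
                                  std::vector<float>* out) {
      bool found_inf = false;
      out->resize(x.size());
      for (size_t i = 0; i < x.size(); ++i) {
        if (!std::isfinite(x[i])) found_inf = true;
        (*out)[i] = x[i] / scale;  // harmless even when found_inf is true
      }
      return found_inf;
    }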
for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - if (!found_inf_data) { - // MatMul - const auto& runner_matmul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_matmul.Run(stream); - } + const auto& runner_mul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_mul.Run(stream); } - // set found_inf to true - VLOG(4) << "found overflow:" << found_inf_data; - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = found_inf_data; - - framework::TensorCopy( - found_inf_tensor, ctx.GetPlace(), - ctx.template device_context(), found_inf); - ctx.template device_context().Wait(); - const auto& runner_clear_status = NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); runner_clear_status.Run(stream); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 153fa529f96a59..c1c4f14582e95e 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -120,6 +120,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { : len; } } else if (context.Attr("set_constant")) { + // TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION. math::SetConstant set_constant; set_constant(dev_ctx, fused_tensor, static_cast(context.Attr("constant"))); @@ -145,6 +146,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel { offset = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; +#if defined(PADDLE_WITH_ASCEND_CL) + auto stream = + context.template device_context() + .stream(); + platform::NPUMemsetAsync( + static_cast(fused_tensor->mutable_data(dev_ctx.GetPlace())), + 0.0, fused_tensor->numel() * sizeof(T), stream); +#endif for (size_t i = 0; i < out_tensors.size(); ++i) { size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); @@ -160,6 +169,12 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" << " address: " << out_tensors[i]->data() << ", "; } + PADDLE_ENFORCE_EQ( + (int64_t)offset, fused_tensor->numel(), + platform::errors::InvalidArgument( + "The alloc_space_for_vars's offset: %s is unequal with " + "fused_tensor's numel: %s.", + offset, fused_tensor->numel())); VLOG(10) << ss.str(); } @@ -191,13 +206,13 @@ class CoalesceTensorOpKernel : public framework::OpKernel { ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << lod_tensors[i]->data() << ", "; + *numel += use_align ? 
platform::Alignment( static_cast(size) * size_of_dtype, place) / size_of_dtype : static_cast(size); } - VLOG(10) << ss.str(); } }; @@ -309,6 +324,16 @@ REGISTER_OP_XPU_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_ASCEND_CL) +REGISTER_OP_NPU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + REGISTER_OP_VERSION(coalesce_tensor) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index f6281aa8ca2710..b8631b44f14caa 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel { ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); - auto comm_stream = - platform::NCCLCommContext::Instance().Get(rid, place)->stream(); #ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc new file mode 100644 index 00000000000000..3055e2ceb23dd2 --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_embedding_op.h" + +namespace paddle { +namespace operators { + +class CEmbeddingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CEmbeddingOp"); + OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "CEmbeddingOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CEmbeddingOp"); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ(table_dims.size(), 2, + platform::errors::InvalidArgument( + "The dimensions of the 'c_embedding' must be 2. 
" + "But received c_embedding's dimensions = %d, " + "c_embedding's shape = [%s].", + table_dims.size(), table_dims)); + + auto output_dims = framework::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "W"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int64 " + "contains the ids to be looked up in W."); + AddOutput("Out", "The lookup results, which have the same type as W."); + + AddAttr("start_index", + "(int64, default 0), The starting index is indeed, " + "and the out-of-bounds will be set to 0 ") + .SetDefault(0); + AddComment(R"DOC( +c_embedding Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(CEmbeddingGradOpNoBufferVarsInferer, "W"); + +template +class CEmbeddingGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("c_embedding_grad"); + + op->SetInput("W", this->Input("W")); + op->SetInput("Ids", this->Input("Ids")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); + + op->SetAttrMap(this->Attrs()); + } +}; + +class CEmbeddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = framework::GradVarName("W"); + VLOG(3) << "c_embedding_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker, + ops::CEmbeddingGradOpMaker, + ops::CEmbeddingGradOpMaker); + +REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad, + ops::CEmbeddingGradOpNoBufferVarsInferer, + ops::CEmbeddingOpGradVarTypeInference); + 
+REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel, + ops::CEmbeddingOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu new file mode 100644 index 00000000000000..ecf3887eef4ac6 --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_embedding_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void CEmbedding(T *out, const T *table, const IndexT *ids, + const int rows, const int columns, const int64_t N, + const int64_t start_idx, const int64_t end_idx, + const int64_t limit) { + CUDA_KERNEL_LOOP(i, limit) { + size_t row = i / columns; + size_t col = i % columns; + auto id = ids[row]; + + if (id >= start_idx && id < end_idx) { + auto real_idx = id - start_idx; + PADDLE_ENFORCE(real_idx < N, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be less than [%d], but received [%d]", + N, real_idx); + out[i] = table[real_idx * columns + col]; + } else { + out[i] = static_cast(0); + } + } +} + +template +__global__ void CEmbeddingGrad(T *table, const T *output, const IndexT *ids, + const int rows, const int columns, + const int64_t N, const int64_t start_idx, + const int64_t end_idx, const int64_t limit) { + CUDA_KERNEL_LOOP(i, limit) { + size_t row = i / columns; + size_t col = i % columns; + auto id = ids[row]; + if (id >= start_idx && id < end_idx) { + auto real_idx = id - start_idx; + paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col], + output[i]); + } + } +} + +template +class CEmbeddingCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); + + const auto &dev_ctx = + context.template device_context(); + const int64_t start_idx = context.Attr("start_index"); + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + const int64_t end_idx = start_idx + N; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto limit = K * D; + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + const auto &index_type = ids_t->type(); + if (index_type == framework::proto::VarType::INT32) { + CEmbedding<<>>( + output, table, ids_t->data(), K, D, N, start_idx, end_idx, + limit); + + } else if (index_type == framework::proto::VarType::INT64) { + CEmbedding<<>>( + output, table, ids_t->data(), K, D, N, start_idx, end_idx, + limit); + } + } +}; + +template +class CEmbeddingGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const auto &dev_ctx = + context.template device_context(); + const int64_t start_idx = context.Attr("start_index"); + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + + const int64_t end_idx = start_idx + N; + auto limit = K * D; + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + const auto &index_type = ids_t->type(); + if (index_type == framework::proto::VarType::INT32) { + CEmbeddingGrad<<>>( + d_table, d_output, ids_t->data(), K, D, N, start_idx, + end_idx, limit); + } else if (index_type == framework::proto::VarType::INT64) { + CEmbeddingGrad<<>>( + d_table, d_output, ids_t->data(), K, D, N, start_idx, + end_idx, limit); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(c_embedding, ops::CEmbeddingCUDAKernel, + ops::CEmbeddingCUDAKernel, + ops::CEmbeddingCUDAKernel); +REGISTER_OP_CUDA_KERNEL(c_embedding_grad, ops::CEmbeddingGradCUDAKernel, + ops::CEmbeddingGradCUDAKernel, + ops::CEmbeddingGradCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_embedding_op.h b/paddle/fluid/operators/collective/c_embedding_op.h new file mode 100644 index 
diff --git a/paddle/fluid/operators/collective/c_embedding_op.h b/paddle/fluid/operators/collective/c_embedding_op.h
new file mode 100644
index 00000000000000..3cab6d7184441d
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_embedding_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class CEmbeddingOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support c_embedding for cpu kernel now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc
new file mode 100644
index 00000000000000..f75e1b3c7aedcc
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc
@@ -0,0 +1,194 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits",
+                   "CSoftmaxWithCrossEntropyOp");
+    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label",
+                   "CSoftmaxWithCrossEntropyOp");
+
+    OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax",
+                   "CSoftmaxWithCrossEntropyOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss",
+                   "CSoftmaxWithCrossEntropyOp");
+
+    auto logits_dims = ctx->GetInputDim("Logits");
+    auto labels_dims = ctx->GetInputDim("Label");
+
+    auto logits_rank = logits_dims.size();
+    auto axis = logits_rank - 1;
+    for (int i = 0; i < logits_rank; i++) {
+      if (i != axis) {
+        if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
+          PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i],
+                            platform::errors::InvalidArgument(
+                                "Input(Logits) and Input(Label) should be in "
+                                "the same shape in all dimensions except the "
+                                "axis."));
+        }
+      }
+    }
+
+    PADDLE_ENFORCE_EQ(
+        labels_dims[logits_rank - 1], 1UL,
+        platform::errors::InvalidArgument(
+            "The last dimension of Input(Label) should be 1, "
+            "but received [%d] at dimension index [%d].",
+            labels_dims[logits_rank - 1], logits_rank - 1));
+
+    ctx->SetOutputDim("Softmax", logits_dims);
+
+    logits_dims[axis] = 1;
+    ctx->SetOutputDim("Loss", logits_dims);
+
+    ctx->ShareLoD("Logits", /*->*/ "Softmax");
+    ctx->ShareLoD("Logits", /*->*/ "Loss");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Logits"),
+        ctx.device_context());
+  }
+};
+
+class CSoftmaxWithCrossEntropyOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("Logits",
+             "(Tensor, default: Tensor<float>), The input tensor of unscaled "
+             "log probabilities, whose dimension :attr:`axis` should be scaled "
+             "by softmax.");
+    AddInput(
+        "Label",
+        "(Tensor) The input tensor of ground truth label. If "
+        ":attr:`soft_label` is set to false, Label is a Tensor<int64> in same "
+        "shape with Input(Logits) except the shape in dimension :attr:`axis` "
+        "as 1. If soft_label is set to true, Label is a Tensor<float/double> "
+        "in same shape with Input(Logits).");
+    AddOutput(
+        "Softmax",
+        "(Tensor, default: Tensor<float>), A tensor in same shape with "
+        "Input(Logits). "
+        "The softmax activation of the given input batch, "
+        "which will be used in the backward calculation.");
+    AddOutput("Loss",
+              "(Tensor, default: Tensor<float>), A tensor in same shape with "
+              "Input(Logits) "
+              "except the shape in dimension :attr:`axis` as 1. The cross "
+              "entropy loss.");
+    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("rank",
+                 "(int default 0) rank id for CSoftmaxWithCrossEntropy.")
+        .SetDefault(0);
+    AddAttr<int>("nranks",
+                 "(int default 0) number of ranks for "
+                 "CSoftmaxWithCrossEntropy.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+CSoftmaxWithCrossEntropy Operator
+
+)DOC");
+  }
+};
+
+class CSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Loss@Grad) should not be null."));
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true,
+                      platform::errors::InvalidArgument(
+                          "Input(Softmax) should not be null."));
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("Label"), true,
+        platform::errors::InvalidArgument("Input(Label) should not be null."));
+
+    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true,
+                      platform::errors::InvalidArgument(
+                          "Output(Logits@Grad) should not be null."));
+
+    ctx->SetOutputDim(framework::GradVarName("Logits"),
+                      ctx->GetInputDim("Softmax"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Loss")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class CSoftmaxWithCrossEntropyOpGradMaker
+    : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("c_softmax_with_cross_entropy_grad");
+
+    op->SetInput("Softmax", this->Output("Softmax"));
+    op->SetInput("Label", this->Input("Label"));
+    op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
+    op->SetAttrMap(this->Attrs());
+    op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits"));
+  }
+};
+
+DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyInplaceInferer,
+                           {"Logits", "Softmax"});
+
+DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyGradInplaceInferer,
+                           {"Softmax", framework::GradVarName("Logits")});
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OPERATOR(
+    c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOp,
+    ops::CSoftmaxWithCrossEntropyOpMaker,
+    ops::CSoftmaxWithCrossEntropyOpGradMaker<paddle::framework::OpDesc>,
+    ops::CSoftmaxWithCrossEntropyOpGradMaker<paddle::imperative::OpBase>,
+    ops::CSoftmaxWithCrossEntropyInplaceInferer);
+
+REGISTER_OPERATOR(c_softmax_with_cross_entropy_grad,
+                  ops::CSoftmaxWithCrossEntropyOpGrad,
+                  ops::CSoftmaxWithCrossEntropyGradInplaceInferer);
+
+REGISTER_OP_CPU_KERNEL(c_softmax_with_cross_entropy,
+                       ops::CSoftmaxWithCrossEntropyOpCPUKernel<float>,
+                       ops::CSoftmaxWithCrossEntropyOpCPUKernel<double>,
+                       ops::CSoftmaxWithCrossEntropyOpCPUKernel<plat::float16>);
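The CUDA file that follows distributes this operator over the class dimension: each rank holds a width-D slice of the logits, and the per-row reductions are completed across ranks with ncclAllReduce (max for the stabilizer, sum for the target logit and for the partition function). Written out, for row logits x over all ranks and target class y, the kernels compute the numerically stabilized quantities:

\[
m = \max_j x_j, \qquad s_j = e^{x_j - m}, \qquad S = \sum_j s_j,
\]
\[
\mathrm{loss} = \log S - (x_y - m), \qquad \mathrm{softmax}_j = \frac{s_j}{S}.
\]

Only the rank whose slice contains class y contributes the term x_y - m; the predicted-logits buffer is zero-initialized on every rank, which is why a plain sum allreduce recovers the full term.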
diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
new file mode 100644
index 00000000000000..77db86e7111112
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -0,0 +1,262 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/softmax_impl.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#include "paddle/fluid/string/string_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T, typename IndexT>
+__global__ void MaskLabelByIndex(T* predicted_logits, const T* logit,
+                                 const IndexT* label, const int start_index,
+                                 const int end_index, const int64_t N,
+                                 const int64_t D, const int nranks) {
+  CUDA_KERNEL_LOOP(i, N) {
+    auto real_label = label[i];
+    PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0),
+                   "The index is out of bounds, "
+                   "please check whether the value of label and "
+                   "input meet the class number. It should "
+                   "be less than [%d], but received [%d]",
+                   D * nranks, real_label);
+
+    if (real_label >= start_index && real_label < end_index) {
+      predicted_logits[i] = logit[i * D + real_label - start_index];
+    }
+  }
+}
+
+template <typename T, typename IndexT>
+__global__ void MaskLabelByIndexGrad(T* logits_grad, const T* loss_grad,
+                                     const IndexT* labels,
+                                     const int start_index,
+                                     const int end_index, const int64_t N,
+                                     const int64_t D) {
+  CUDA_KERNEL_LOOP(i, N * D) {
+    auto row = i / D;
+    auto col = i % D;
+    if ((col + start_index) == labels[row]) {
+      logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
+    } else {
+      logits_grad[i] *= loss_grad[row];
+    }
+  }
+}
+
+template <typename T>
+class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* logits = ctx.Input<Tensor>("Logits");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* softmax = ctx.Output<Tensor>("Softmax");
+    Tensor* loss = ctx.Output<Tensor>("Loss");
+
+    const int rid = ctx.Attr<int>("ring_id");
+    const int nranks = ctx.Attr<int>("nranks");
+    const int rank = ctx.Attr<int>("rank");
+
+    const auto& place = ctx.GetPlace();
+    const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    // use global calculate stream
+    const auto stream = static_cast<platform::CUDADeviceContext*>(
+                            platform::DeviceContextPool::Instance().Get(place))
+                            ->stream();
+
+    // allocate memory on device.
+    softmax->mutable_data<T>(place);
+    loss->mutable_data<T>(place);
+
+    const auto& logits_dims = logits->dims();
+    const auto& labels_dims = labels->dims();
+
+    const int axis = logits_dims.size() - 1;
+    const int N = SizeToAxis(axis, logits_dims);
+    const int D = SizeFromAxis(axis, logits_dims);
+
+    Tensor logits_2d, softmax_2d, loss_2d;
+    logits_2d.ShareDataWith(*logits).Resize({N, D});
+    softmax_2d.ShareDataWith(*softmax).Resize({N, D});
+    loss_2d.ShareDataWith(*loss).Resize({N, 1});
+
+    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+
+    // step 1, obtain logit_max
+    Tensor logits_max;
+    logits_max =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
+    void* logits_max_buff = logits_max.mutable_data<T>(place);
+
+    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    Eigen::DSizes<int, 1> along_axis(1);
+    eigen_logits_max.device(*dev_ctx.eigen_device()) =
+        eigen_logits.maximum(along_axis);
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        logits_max_buff, logits_max_buff, logits_max.numel(),
+        platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(),
+        stream));
+
+    // step 2, obtain logit - logit_max
+    Eigen::DSizes<int, 2> batch_by_one(N, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, D);
+
+    eigen_softmax.device(*dev_ctx.eigen_device()) =
+        (eigen_logits -
+         eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
+            .unaryExpr(math::ValueClip<T>());
+
+    // step 3, obtain predict target
+    Tensor predicted_logits;
+    predicted_logits =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
+    predicted_logits.mutable_data<T>(place);
+
+    auto t = framework::EigenVector<T>::Flatten(predicted_logits);
+    t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+
+    const int start_index = rank * D;
+    const int end_index = start_index + D;
+
+    int blocks = NumBlocks(N);
+    int threads = kNumCUDAThreads;
+    const auto& label_type = labels->type();
+
+    if (label_type == framework::proto::VarType::INT32) {
+      MaskLabelByIndex<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          predicted_logits.data<T>(), softmax_2d.data<T>(),
+          labels->data<int32_t>(), start_index, end_index, N, D, nranks);
+    } else if (label_type == framework::proto::VarType::INT64) {
+      MaskLabelByIndex<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          predicted_logits.data<T>(), softmax_2d.data<T>(),
+          labels->data<int64_t>(), start_index, end_index, N, D, nranks);
+    }
+
+    void* predict_logits_buff = predicted_logits.mutable_data<T>(place);
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        predict_logits_buff, predict_logits_buff, predicted_logits.numel(),
+        platform::ToNCCLDataType(predicted_logits.type()), ncclSum,
+        comm->comm(), stream));
+
+    // step 4, obtain exp(logit)
+    eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
+
+    // step 5, obtain sum_exp_logits
+    Tensor sum_exp_logits;
+    sum_exp_logits =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
+    void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
+
+    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
+        eigen_softmax.sum(along_axis);
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
+        sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
+        platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(),
+        stream));
+
+    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+
+    eigen_loss.device(*dev_ctx.eigen_device()) =
+        (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue<T>()) -
+         eigen_predicted_logits)
+            .unaryExpr(math::TolerableValue<T>());
+
+    eigen_softmax.device(*dev_ctx.eigen_device()) =
+        (eigen_softmax *
+         eigen_sum_exp_logits.inverse().broadcast(one_by_class));
+  }
+};
+
+template <typename T>
+class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* labels = context.Input<Tensor>("Label");
+    const Tensor* loss_grad =
+        context.Input<Tensor>(framework::GradVarName("Loss"));
+    Tensor* logit_grad =
+        context.Output<Tensor>(framework::GradVarName("Logits"));
+    const Tensor* softmax = context.Input<Tensor>("Softmax");
+    const int rank = context.Attr<int>("rank");
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+
+    if (logit_grad != softmax) {
+      framework::TensorCopy(*softmax, context.GetPlace(),
+                            context.device_context(), logit_grad);
+    }
+    const auto softmax_dims = softmax->dims();
+    const int axis = softmax_dims.size() - 1;
+    const int N = SizeToAxis(axis, softmax_dims);
+    const int D = SizeFromAxis(axis, softmax_dims);
+
+    Tensor logit_grad_2d;
+    logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D});
+
+    int blocks = NumBlocks(N * D);
+    int threads = kNumCUDAThreads;
+    const auto& label_type = labels->type();
+    const int start_index = rank * D;
+    const int end_index = start_index + D;
+
+    if (label_type == framework::proto::VarType::INT32) {
+      MaskLabelByIndexGrad<
+          T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          logit_grad_2d.data<T>(), loss_grad->data<T>(),
+          labels->data<int32_t>(), start_index, end_index, N, D);
+    } else if (label_type == framework::proto::VarType::INT64) {
+      MaskLabelByIndexGrad<
+          T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          logit_grad_2d.data<T>(), loss_grad->data<T>(),
+          labels->data<int64_t>(), start_index, end_index, N, D);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    c_softmax_with_cross_entropy,
+    ops::CSoftmaxWithCrossEntropyOpCUDAKernel<float>,
+    ops::CSoftmaxWithCrossEntropyOpCUDAKernel<double>,
+    ops::CSoftmaxWithCrossEntropyOpCUDAKernel<plat::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    c_softmax_with_cross_entropy_grad,
+    ops::CSoftmaxWithCrossEntropyGradCUDAKernel<float>,
+    ops::CSoftmaxWithCrossEntropyGradCUDAKernel<double>,
+    ops::CSoftmaxWithCrossEntropyGradCUDAKernel<plat::float16>);
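The grad kernel above encodes the textbook gradient of softmax cross entropy, restricted to this rank's logit slice [start_index, start_index + D): dL/dx_j = (softmax_j - 1[j = y]) * dL/dloss, where logit_grad holds the softmax on entry. A host-side reference of the same update, with illustrative names that are not part of the patch:

#include <cstdint>
#include <vector>

// Reference for MaskLabelByIndexGrad: logits_grad holds softmax on entry
// and the gradient w.r.t. this rank's logit slice on exit.
void SoftmaxCrossEntropyGradRef(std::vector<float>* logits_grad,
                                const std::vector<float>& loss_grad,
                                const std::vector<int64_t>& labels, int64_t N,
                                int64_t D, int64_t start_index) {
  for (int64_t row = 0; row < N; ++row) {
    for (int64_t col = 0; col < D; ++col) {
      float& g = (*logits_grad)[row * D + col];
      if (col + start_index == labels[row]) {
        g = (g - 1.0f) * loss_grad[row];  // (softmax_y - 1) * dloss
      } else {
        g *= loss_grad[row];              // softmax_j * dloss
      }
    }
  }
}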
diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h
new file mode 100644
index 00000000000000..c7cfd41fa25568
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/cross_entropy.h"
+#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/fluid/operators/softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support c_softmax_with_cross_entropy for cpu kernel now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc
index 03046d571d0f05..37ec989f3f9812 100644
--- a/paddle/fluid/operators/collective/c_split_op.cc
+++ b/paddle/fluid/operators/collective/c_split_op.cc
@@ -45,6 +45,12 @@ class CSplitOp : public framework::OperatorWithKernel {
                           rank, nranks));
 
     framework::DDim dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(
+        dim[dim.size() - 1] % nranks, 0,
+        platform::errors::InvalidArgument(
+            "The last dimension (%d) of Input(X) should be divisible "
+            "by nranks (%d).",
+            dim[dim.size() - 1], nranks));
+
     dim[dim.size() - 1] = dim[dim.size() - 1] / nranks;
     if (dim[0] < 0) dim[0] = -1;
     ctx->SetOutputDim("Out", dim);
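The renamed CUDA file below replaces the generic SplitFunctor path with a dedicated kernel that copies each rank's column block directly. A host-side reference of the index arithmetic, assuming columns % nranks == 0 (which the InferShape check just added guarantees); names are illustrative, not part of the patch:

#include <cstdint>
#include <vector>

// Reference for SplitFromRank: keep columns [rank*block, (rank+1)*block)
// of a rows x columns matrix, where block = columns / nranks.
std::vector<float> SplitFromRankRef(const std::vector<float>& input,
                                    int64_t rows, int64_t columns, int rank,
                                    int nranks) {
  const int64_t block = columns / nranks;
  const int64_t start = block * rank;
  std::vector<float> output(rows * block);
  for (int64_t row = 0; row < rows; ++row) {
    for (int64_t b = 0; b < block; ++b) {
      output[row * block + b] = input[row * columns + start + b];
    }
  }
  return output;
}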
diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu
similarity index 65%
rename from paddle/fluid/operators/collective/c_split_op.cu.cc
rename to paddle/fluid/operators/collective/c_split_op.cu
index 92a7f5e41b1d2d..034accbb480c78 100644
--- a/paddle/fluid/operators/collective/c_split_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_split_op.cu
@@ -16,10 +16,38 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_split_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
 
 namespace paddle {
 namespace operators {
 
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void SplitFromRank(const T* input, T* output, const int rows,
+                              const int columns, const int rank,
+                              const int nranks, const int limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
+    int row = i / columns;
+    int col = i % columns;
+
+    int block = columns / nranks;
+    int start = block * rank;
+    int end = start + block;
+
+    if (col >= start && col < end) {
+      int idx = block * row + col % block;
+      output[idx] = input[i];
+    }
+  }
+}
+
 template <typename T>
 class CSplitOpCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -47,24 +75,25 @@ class CSplitOpCUDAKernel : public framework::OpKernel<T> {
                           rank, nranks));
 
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    std::vector<const framework::Tensor*> shape_refer;
-    std::vector<framework::Tensor*> results;
-    size_t numel = x->numel();
     auto dims = x->dims();
-    numel /= nranks;
-    int axis = dims.size() - 1;
-    dims[dims.size() - 1] /= nranks;
-    for (int i = 0; i < nranks; i++) {
-      framework::Tensor* out = new framework::Tensor();
-      out->mutable_data<T>(dims, place);
-      shape_refer.emplace_back(out);
-      results.emplace_back(out);
-    }
+    auto dims_size = dims.size();
+    // final dim
+    int64_t end_size = dims[dims_size - 1];
 
-    math::SplitFunctor<platform::CUDADeviceContext, T> functor;
-    functor(dev_ctx, *x, shape_refer, axis, &results);
+    // remain dim
+    auto remain_ddim = framework::slice_ddim(dims, 0, dims_size - 1);
+    int64_t remain_numel = framework::product(remain_ddim);
+
+    int limit = x->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+
+    dims[dims_size - 1] /= nranks;
     out->mutable_data<T>(dims, place);
-    paddle::framework::TensorCopySync(*results[rank], out->place(), out);
+
+    SplitFromRank<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        x->data<T>(), out->data<T>(), remain_numel, end_size, rank, nranks,
+        limit);
   }
 };
 }  // namespace operators
diff --git a/paddle/fluid/operators/compat/affine_channel.pbtxt b/paddle/fluid/operators/compat/affine_channel.pbtxt new file mode 100644 index 00000000000000..444fde59a9631c --- /dev/null +++ b/paddle/fluid/operators/compat/affine_channel.pbtxt @@ -0,0 +1,41 @@ +type: "affine_channel" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + attrs { + name: "data_layout" + type: STRING + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/batch_norm.pbtxt b/paddle/fluid/operators/compat/batch_norm.pbtxt new file mode 100644 index 00000000000000..c18b4dc19dc2e7 --- /dev/null +++ b/paddle/fluid/operators/compat/batch_norm.pbtxt @@ -0,0 +1,94 @@ +type: "batch_norm" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + inputs { + name: "Mean" + } + inputs { + name: "Variance" + } + outputs { + name: "Y" + } + attrs { + name: "epsilon" + type: FLOAT + } +} +extra { + inputs { + name: "MomentumTensor" + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "momentum" + type: FLOAT + } + attrs { + name:
"data_layout" + type: STRING + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "fuse_with_relu" + type: BOOLEAN + } + attrs { + name: "use_global_stats" + type: BOOLEAN + } + attrs { + name: "trainable_statistics" + type: BOOLEAN + } + outputs { + name: "MeanOut" + } + outputs { + name: "VarianceOut" + } + outputs { + name: "SavedMean" + } + outputs { + name: "SavedVariance" + } + outputs { + name: "ReserveSpace" + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/concat.pbtxt b/paddle/fluid/operators/compat/concat.pbtxt new file mode 100644 index 00000000000000..54c8e089829eb1 --- /dev/null +++ b/paddle/fluid/operators/compat/concat.pbtxt @@ -0,0 +1,50 @@ +type: "concat" +def { + inputs { + name: "X" + } + inputs { + name: "AxisTensor" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt new file mode 100644 index 00000000000000..7e3ecb22152b56 --- /dev/null +++ b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt @@ -0,0 +1,110 @@ +type: "reduce_mean" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + outputs { + name: "Output" + } + attrs { + name: "output_padding" + type: INTS + } + attrs { + name: "output_size" + type: INTS + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "workspace_size_MB" + type: INT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/elementwise_add.pbtxt b/paddle/fluid/operators/compat/elementwise_add.pbtxt new file mode 100644 index 00000000000000..3e96147ef88ebb --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_add.pbtxt @@ -0,0 +1,73 @@ +type: "elementwise_add" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra 
{ + attrs { + name: "x_data_format" + type: STRING + # no longer to use + } + attrs { + name: "y_data_format" + type: STRING + # no longer to use + } + attrs { + name: "use_quantizer" + type: BOOLEAN + # no longer to use, Use 'mkldnn_data_type' instead. + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_div.pbtxt b/paddle/fluid/operators/compat/elementwise_div.pbtxt new file mode 100644 index 00000000000000..40e9d90dbfd89a --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_div.pbtxt @@ -0,0 +1,74 @@ +type: "elementwise_div" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_pow.pbtxt b/paddle/fluid/operators/compat/elementwise_pow.pbtxt new file mode 100644 index 00000000000000..3ad21423e40aba --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_pow.pbtxt @@ -0,0 +1,74 @@ +type: "elementwise_pow" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_sub.pbtxt b/paddle/fluid/operators/compat/elementwise_sub.pbtxt new file mode 100644 index 00000000000000..b449e76ca06443 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_sub.pbtxt @@ -0,0 +1,74 @@ +type: "elementwise_sub" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "x_data_format" 
+ type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt new file mode 100644 index 00000000000000..22954c9ba22ce4 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt @@ -0,0 +1,46 @@ +type: "fake_channel_wise_quantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "quant_axis" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt b/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt new file mode 100644 index 00000000000000..0a55c0e44862ce --- /dev/null +++ b/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt @@ -0,0 +1,38 @@ +type: "fake_dequantize_max_abs" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + outputs { + name: "Out" + } + attrs { + name: "max_range" + type: FLOAT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt new file mode 100644 index 00000000000000..92ee54eb94c0e1 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt @@ -0,0 +1,38 @@ +type: "fake_quantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt new file mode 100644 index 00000000000000..dddb58f827ea03 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt @@ -0,0 +1,61 @@ +type: "fake_quantize_moving_average_abs_max" +def { + inputs { + name: "X" + } + inputs { + name: "InScale" + } + inputs { + name: "InAccum" + } + inputs { + name: "InState" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + outputs { + name: "OutState" + 
} + outputs { + name: "OutAccum" + } + attrs { + name: "moving_rate" + type: FLOAT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt new file mode 100644 index 00000000000000..1050b724ee6b44 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt @@ -0,0 +1,55 @@ +type: "fake_quantize_range_abs_max" +def { + inputs { + name: "X" + } + inputs { + name: "InScale" + } + inputs { + name: "Iter" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + outputs { + name: "OutScales" + } + attrs { + name: "window_size" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fc.pbtxt b/paddle/fluid/operators/compat/fc.pbtxt new file mode 100644 index 00000000000000..55e1a22ce4da5f --- /dev/null +++ b/paddle/fluid/operators/compat/fc.pbtxt @@ -0,0 +1,97 @@ +type: "fc" +def { + inputs { + name: "Input" + } + inputs { + name: "W" + } + inputs { + name: "Bias" + } + outputs { + name: "Out" + } + attrs { + name: "in_num_col_dims" + type: INT + } + attrs { + name: "activation_type" + type: STRING + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "padding_weights" + type: BOOLEAN + } + attrs { + name: "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "weight_scale" + type: FLOATS + } + attrs { + name: "Input_scale" + type: FLOAT + } + attrs { + name: "out_scale" + type: FLOAT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "enable_int8" + type: BOOLEAN + } + attrs { + name: "use_fc_padding" + type: BOOLEAN + } + attrs { + name: "use_gpu" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fill_constant.pbtxt b/paddle/fluid/operators/compat/fill_constant.pbtxt new file mode 100644 index 00000000000000..b525da04a0d88b --- /dev/null +++ b/paddle/fluid/operators/compat/fill_constant.pbtxt @@ -0,0 +1,61 @@ +type: "fill_constant" +def { + inputs { + name: "ValueTensor" + } + inputs { + name: "ShapeTensor" + } + inputs { + name: "ShapeTensorList" + } + outputs { + name: "Out" + } + attrs { + name: "dtype" + type: INT + } + attrs { + name: "shape" + type: LONGS + } + attrs { + name: "value" + type: FLOAT + } + attrs { + name: "str_value" + type: STRING + } +} +extra { + attrs { + name: "force_cpu" + type: BOOLEAN + } + attrs { + name: "place_type" + 
type: INT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/flatten2.pbtxt b/paddle/fluid/operators/compat/flatten2.pbtxt new file mode 100755 index 00000000000000..6b8a6661a6fd7d --- /dev/null +++ b/paddle/fluid/operators/compat/flatten2.pbtxt @@ -0,0 +1,38 @@ +type: "flatten2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/gru.pbtxt b/paddle/fluid/operators/compat/gru.pbtxt new file mode 100644 index 00000000000000..38aa8a92f75bd9 --- /dev/null +++ b/paddle/fluid/operators/compat/gru.pbtxt @@ -0,0 +1,65 @@ +type: "gru" +def { + inputs { + name: "Input" + } + inputs { + name: "H0" + } + inputs { + name: "Weight" + } + inputs { + name: "Bias" + } + outputs { + name: "BatchGate" + } + outputs { + name: "BatchResetHiddenPrev" + } + outputs { + name: "BatchHidden" + } + outputs { + name: "Hidden" + } + attrs { + name: "activation" + type: STRING + } + attrs { + name: "gate_activation" + type: STRING + } + attrs { + name: "is_reverse" + type: BOOLEAN + } + attrs { + name: "origin_mode" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/layer_norm.pbtxt b/paddle/fluid/operators/compat/layer_norm.pbtxt new file mode 100644 index 00000000000000..dbb78e0a8baa1e --- /dev/null +++ b/paddle/fluid/operators/compat/layer_norm.pbtxt @@ -0,0 +1,63 @@ +type: "layer_norm" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + outputs { + name: "Y" + } + outputs { + name: "Mean" + } + outputs { + name: "Variance" + } + attrs { + name: "epsilon" + type: FLOAT + } + attrs { + name: "begin_norm_axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/lstm.pbtxt b/paddle/fluid/operators/compat/lstm.pbtxt new file mode 100644 index 00000000000000..889911a8408cb0 --- /dev/null +++ b/paddle/fluid/operators/compat/lstm.pbtxt @@ -0,0 +1,72 @@ +type: "lstm" +def { + inputs { + name: "Input" + } + inputs { + name: "H0" + } + inputs { + name: "C0" + } + inputs { + name: "Weight" + } + inputs { + name: "Bias" + } + outputs { + name: "Hidden" + } + outputs { + name: "Cell" + } + outputs { + name: "BatchGate" + } + outputs { + name: "BatchCellPreAct" + } + attrs { + name: "use_peepholes" + type: BOOLEAN + } + attrs { + name: "is_reverse" + type: BOOLEAN 
+ } + attrs { + name: "gate_activation" + type: STRING + } + attrs { + name: "cell_activation" + type: STRING + } + attrs { + name: "candidate_activation" + type: STRING + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/matmul.pbtxt b/paddle/fluid/operators/compat/matmul.pbtxt new file mode 100644 index 00000000000000..e68a7f31b66340 --- /dev/null +++ b/paddle/fluid/operators/compat/matmul.pbtxt @@ -0,0 +1,98 @@ +type: "matmul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "alpha" + type: FLOAT + } + attrs { + name: "transpose_X" + type: BOOLEAN + } + attrs { + name: "transpose_Y" + type: BOOLEAN + } +} +extra { + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "fused_reshape_Out" + type: INTS + } + attrs { + name: "fused_reshape_X" + type: INTS + } + attrs { + name: "fused_reshape_Y" + type: INTS + } + attrs { + name: "fused_transpose_Out" + type: INTS + } + attrs { + name: "fused_transpose_X" + type: INTS + } + attrs { + name: "fused_transpose_Y" + type: INTS + } +} diff --git a/paddle/fluid/operators/compat/matmul_v2.pbtxt b/paddle/fluid/operators/compat/matmul_v2.pbtxt new file mode 100644 index 00000000000000..5f43e1f8bf0e0c --- /dev/null +++ b/paddle/fluid/operators/compat/matmul_v2.pbtxt @@ -0,0 +1,42 @@ +type: "matmul_v2" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "trans_x" + type: BOOLEAN + } + attrs { + name: "trans_y" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/mul.pbtxt b/paddle/fluid/operators/compat/mul.pbtxt new file mode 100644 index 00000000000000..b40c05ad2e0333 --- /dev/null +++ b/paddle/fluid/operators/compat/mul.pbtxt @@ -0,0 +1,87 @@ +type: "mul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "x_num_col_dims" + type: INT + } + attrs { + name: "y_num_col_dims" + type: INT + } +} +extra { + attrs { + name: "skip_quant" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "scale_x" + type: FLOAT + } + attrs { + name: "scale_y" + type: FLOATS + } + attrs { + name: "scale_out" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "enable_int8" + type: BOOLEAN + } + attrs { + name: "X_scale" + type: FLOAT + } + attrs { + name: "weight_scale" + type: FLOAT + } + attrs { + name: "out_scale" + type: FLOAT + } + 
attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + +} diff --git a/paddle/fluid/operators/compat/pool2d.pbtxt b/paddle/fluid/operators/compat/pool2d.pbtxt new file mode 100644 index 00000000000000..1620d1ef1c649a --- /dev/null +++ b/paddle/fluid/operators/compat/pool2d.pbtxt @@ -0,0 +1,92 @@ +type: "pool2d" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "pooling_type" + type: STRING + } + attrs { + name: "ksize" + type: INTS + } + attrs { + name: "global_pooling" + type: BOOLEAN + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "exclusive" + type: BOOLEAN + } + attrs { + name: "adaptive" + type: BOOLEAN + } + attrs { + name: "ceil_mode" + type: BOOLEAN + } + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "padding_algorithm" + type: STRING + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/reduce_mean.pbtxt b/paddle/fluid/operators/compat/reduce_mean.pbtxt new file mode 100644 index 00000000000000..eea6ad127fd452 --- /dev/null +++ b/paddle/fluid/operators/compat/reduce_mean.pbtxt @@ -0,0 +1,55 @@ +type: "reduce_mean" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "dim" + type: INTS + } + attrs { + name: "keep_dim" + type: BOOLEAN + } +} +extra { + attrs { + name: "reduce_all" + type: BOOLEAN + } + attrs { + name: "in_dtype" + type: INT + } + attrs { + name: "out_dtype" + type: INT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/relu.pbtxt b/paddle/fluid/operators/compat/relu.pbtxt new file mode 100644 index 00000000000000..359bd70c2a310c --- /dev/null +++ b/paddle/fluid/operators/compat/relu.pbtxt @@ -0,0 +1,43 @@ +type: "relu" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } +} diff --git a/paddle/fluid/operators/compat/reshape2.pbtxt b/paddle/fluid/operators/compat/reshape2.pbtxt new file mode 100644 index 00000000000000..2ccc83305baca9 --- /dev/null +++ b/paddle/fluid/operators/compat/reshape2.pbtxt @@ -0,0 +1,53 @@ +type: "reshape2" +def { + inputs { + name: "X" + 
} + outputs { + name: "Out" + } + attrs { + name: "shape" + type: INTS + } +} +extra { + inputs { + name: "Shape" + } + inputs { + name: "ShapeTensor" + } + outputs { + name: "XShape" + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/scale.pbtxt b/paddle/fluid/operators/compat/scale.pbtxt new file mode 100644 index 00000000000000..1331cd5cd77a62 --- /dev/null +++ b/paddle/fluid/operators/compat/scale.pbtxt @@ -0,0 +1,43 @@ +type: "scale" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "bias" + type: FLOAT + } + attrs { + name: "scale" + type: FLOAT + } + attrs { + name: "bias_after_scale" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/seqconv.pbtxt b/paddle/fluid/operators/compat/seqconv.pbtxt new file mode 100644 index 00000000000000..d05aabcc0aa409 --- /dev/null +++ b/paddle/fluid/operators/compat/seqconv.pbtxt @@ -0,0 +1,34 @@ +type: "sequence_conv" +def { + inputs { + name: "X" + } + inputs { + name: "Filter" + } + inputs { + name: "PaddingData" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "paddingTrainable" + type: BOOLEAN + } + attrs { + name: "contextLength" + type: INT + } + attrs { + name: "contextStart" + type: INT + } + attrs { + name: "contextStride" + type: INT + } + +} diff --git a/paddle/fluid/operators/compat/sequence_expand.pbtxt b/paddle/fluid/operators/compat/sequence_expand.pbtxt new file mode 100644 index 00000000000000..38169d7b57ded8 --- /dev/null +++ b/paddle/fluid/operators/compat/sequence_expand.pbtxt @@ -0,0 +1,38 @@ +type: "sequence_expand" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "ref_level" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/sigmoid.pbtxt b/paddle/fluid/operators/compat/sigmoid.pbtxt new file mode 100644 index 00000000000000..7b53aa402c1183 --- /dev/null +++ b/paddle/fluid/operators/compat/sigmoid.pbtxt @@ -0,0 +1,39 @@ +type: "sigmoid" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/softmax.pbtxt b/paddle/fluid/operators/compat/softmax.pbtxt new file mode 100644 index 00000000000000..5cd155ed1c63a8 --- /dev/null +++ b/paddle/fluid/operators/compat/softmax.pbtxt @@ -0,0 +1,55 @@ +type: "softmax" +def { + inputs 
{ + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } +} diff --git a/paddle/fluid/operators/compat/sqrt.pbtxt b/paddle/fluid/operators/compat/sqrt.pbtxt new file mode 100644 index 00000000000000..2dbcba802a4086 --- /dev/null +++ b/paddle/fluid/operators/compat/sqrt.pbtxt @@ -0,0 +1,39 @@ +type: "sqrt" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/squeeze2.pbtxt b/paddle/fluid/operators/compat/squeeze2.pbtxt new file mode 100644 index 00000000000000..160e6a72786494 --- /dev/null +++ b/paddle/fluid/operators/compat/squeeze2.pbtxt @@ -0,0 +1,38 @@ +type: "squeeze2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axes" + type: INTS + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/tanh.pbtxt b/paddle/fluid/operators/compat/tanh.pbtxt new file mode 100644 index 00000000000000..a0e6cf8a0a90ad --- /dev/null +++ b/paddle/fluid/operators/compat/tanh.pbtxt @@ -0,0 +1,39 @@ +type: "tanh" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/transpose.pdtxt b/paddle/fluid/operators/compat/transpose.pdtxt new file mode 100644 index 00000000000000..97081e0afc29a8 --- /dev/null +++ b/paddle/fluid/operators/compat/transpose.pdtxt @@ -0,0 +1,52 @@ +type: "transpose" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/transpose2.pdtxt 
b/paddle/fluid/operators/compat/transpose2.pdtxt new file mode 100644 index 00000000000000..34fad62a101e0d --- /dev/null +++ b/paddle/fluid/operators/compat/transpose2.pdtxt @@ -0,0 +1,55 @@ +type: "transpose" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + outputs { + name: "XShape" + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/unsqueeze2.pbtxt b/paddle/fluid/operators/compat/unsqueeze2.pbtxt new file mode 100644 index 00000000000000..ed3c32754a59f0 --- /dev/null +++ b/paddle/fluid/operators/compat/unsqueeze2.pbtxt @@ -0,0 +1,44 @@ +type: "unsqueeze2" +def { + inputs { + name: "X" + } + inputs { + name: "AxesTensor" + } + inputs { + name: "AxesTensorList" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axes" + type: INTS + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index a60201f9d07d69..cc0c46adb119a1 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -13,18 +13,84 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, - paddle::operators::GreaterThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, - paddle::operators::GreaterThanFunctor, - paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +#define DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(func, op) \ + template \ + struct func { \ + using ELEMENT_TYPE = T; \ + inline HOSTDEVICE bool operator()(const T* args) const { \ + return args[0] op args[1]; \ + } \ + }; + +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessThanFunctor, <) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessEqualFunctor, <=) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterThanFunctor, >) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterEqualFunctor, >=) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaEqualFunctor, ==) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaNotEqualFunctor, !=) +#undef DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT + +template +struct CudaEqualFunctor< + T, typename std::enable_if::value>::type> { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { + return fabs(static_cast(args[0] - args[1])) < 1e-8; + } +}; + +template +struct CudaNotEqualFunctor< + T, typename std::enable_if::value>::type> { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { + return fabs(static_cast(args[0] - args[1])) > 1e-8; + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + public: + using InT = typename Functor::ELEMENT_TYPE; + using OutT = bool; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_type, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>); + +REGISTER_CUDA_COMPARE_KERNEL(equal, CudaEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(not_equal, CudaNotEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_than, CudaLessThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_equal, CudaLessEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_than, CudaGreaterThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_equal, CudaGreaterEqualFunctor) +#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index 62019be26cdef8..6705d42bcd7408 100644 --- 
a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -73,6 +73,8 @@ class ConditionalBlockInferOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); + VLOG(3) << "Conditional block.idx = " << block->ID() + << ", scope = " << &cur_scope; exec.Run(*block->Program(), &cur_scope, block->ID(), false); scope.DeleteScope(scopes->front()); } diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 7ca54b488bfbb2..6cbcd516e08264 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -13,12 +13,68 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, - paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, - paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, - paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, - paddle::operators::LogicalXorFunctor); +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T* args) const { \ + return args[0] op args[1]; \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(CudaAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(CudaXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct CudaNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { return !args[0]; } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + using InT = typename Functor::ELEMENT_TYPE; + using OutT = bool; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + + if (ins.size() == 1) { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } else { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, \ + ops::BinaryLogicalOpKernel>); + +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, CudaXorFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, CudaNotFunctor) +#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index c7eac903a8cc0d..c6cd45dc18ba32 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -211,20 +211,31 @@ struct SearchAlgorithm { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - args.cdesc.desc(), CUDNN_DEFAULT_MATH)); - 
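The conv_cudnn_helper.h hunk in progress here replaces the unconditional CUDNN_DEFAULT_MATH setting with a per-dtype policy. Condensed into one hypothetical helper (the constants and version guards are the real cuDNN/CUDA ones used by the hunk; the function name is invented for illustration):

    // Sketch of the math-type policy the surrounding hunk implements.
    cudnnMathType_t ChooseConvMathType(int compute_capability,
                                       cudnnDataType_t dtype, bool allow_tf32) {
      if (compute_capability >= 70 && dtype == CUDNN_DATA_HALF)
        return CUDNN_TENSOR_OP_MATH;   // FP16 on Volta+ uses tensor cores
    #if CUDA_VERSION >= 11000 && CUDNN_VERSION >= 8100
      if (compute_capability >= 80 && dtype == CUDNN_DATA_BFLOAT16)
        return CUDNN_TENSOR_OP_MATH;   // BF16 on Ampere+ uses tensor cores
    #endif
    #if CUDA_VERSION >= 11000
      if (dtype == CUDNN_DATA_FLOAT && !allow_tf32)
        return CUDNN_FMA_MATH;         // opt out of TF32 for strict FP32
    #endif
      return CUDNN_DEFAULT_MATH;
    }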
VLOG(5) << "NOT use cudnn_tensor_op_math"; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { #if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (dev_ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; +#endif // CUDNN_VERSION >= 8100 + } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_FMA_MATH)); + VLOG(5) << "use cudnn_fma_math"; #endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_DEFAULT_MATH)); + VLOG(5) << "use cudnn_default_math"; } #endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 7fdb1ccfe9614f..c49a3ee1c20ed3 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1413,6 +1413,31 @@ REGISTER_OP_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); #else +#if CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); +REGISTER_OP_KERNEL( + conv2d_grad_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad_grad, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); +#else REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, paddle::operators::CUDNNConvOpKernel, @@ -1432,6 +1457,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); +#endif REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 17ce109610b426..1266cfe6081acf 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -199,6 +199,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::errors::InvalidArgument( "float16 can only be used when CUDNN is used")); } +#if PADDLE_WITH_CUDA + if (input_data_type == framework::proto::VarType::BF16 && + library == framework::LibraryType::kCUDNN) { + PADDLE_ENFORCE_GE( + platform::CudnnVersion(), 8100, + platform::errors::InvalidArgument( + "bfloat16 can only be used when 
CUDNN_VERSION >= 8100")); + } +#endif // PADDLE_WITH_CUDA auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library, customized_type_value); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 6f2a3ca8762384..e6f6c2a39358fd 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -11,6 +11,7 @@ #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel { auto anchors = ctx->Attrs().Get>("anchors"); int anchor_num = anchors.size() / 2; auto class_num = ctx->Attrs().Get("class_num"); + auto iou_aware = ctx->Attrs().Get("iou_aware"); + auto iou_aware_factor = ctx->Attrs().Get("iou_aware_factor"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( "Input(X) should be a 4-D tensor." "But received X dimension(%s)", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_x[1], anchor_num * (5 + class_num), - platform::errors::InvalidArgument( - "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " - "+ class_num))." - "But received dim[1](%s) != (anchor_mask_number * " - "(5+class_num)(%s).", - dim_x[1], anchor_num * (5 + class_num))); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (6 + class_num), + platform::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." + "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, 0, + platform::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, 1, + platform::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + platform::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." + "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], anchor_num * (5 + class_num))); + } PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, platform::errors::InvalidArgument( "Input(ImgSize) should be a 2-D tensor." @@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "Scale the center point of decoded bounding " "box. Default 1.0") .SetDefault(1.); + AddAttr("iou_aware", "Whether use iou aware. Default false.") + .SetDefault(false); + AddAttr("iou_aware_factor", "iou aware factor. Default 0.5.") + .SetDefault(0.5); AddComment(R"DOC( This operator generates YOLO detection boxes from output of YOLOv3 network. @@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. 
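Put differently, the expected channel count of the yolo_box input follows directly from the attributes; a hypothetical one-line helper stating the shape check added above:

    // Channel count the InferShape check above enforces (name invented):
    // 4 box coords + 1 objectness (+ 1 IoU head when iou_aware) + classes.
    int ExpectedYoloBoxChannels(int anchor_num, int class_num, bool iou_aware) {
      return anchor_num * ((iou_aware ? 6 : 5) + class_num);
    }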
In the second dimension(the channel - dimension), C should be equal to S * (5 + class_num), class_num is the object + dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, + otherwise C should be equal to S * (6 + class_num). class_num is the object category number of source dataset(such as 80 in coco dataset), so the second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor @@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { score_{pred} = score_{conf} * score_{class} $$ + + where the confidence score follows the formula below + + .. math:: + + score_{conf} = \begin{cases} + obj, & \text{if } iou\_aware == false \\ + obj^{1 - iou\_aware\_factor} * iou^{iou\_aware\_factor}, & \text{otherwise} + \end{cases} + )DOC"); } }; @@ -197,3 +237,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel<float>, ops::YoloBoxKernel<double>); + +REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( R"ROC( Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. )ROC", paddle::framework::compatible::OpVersionDesc() .NewAttr("iou_aware", "Whether use iou aware", false) .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 65dc73ef383235..83a0eb87d02dd5 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; T box[4]; @@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); T conf = sigmoid(input[obj_idx]); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + T iou = sigmoid(input[iou_idx]); + conf = pow(conf, static_cast<T>(1.
- iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } if (conf < conf_thresh) { continue; } - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, input_size_w, box_idx, grid_num, img_height, img_width, scale, bias); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { float conf_thresh = ctx.Attr("conf_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); bool clip_bbox = ctx.Attr("clip_bbox"); + bool iou_aware = ctx.Attr("iou_aware"); + float iou_aware_factor = ctx.Attr("iou_aware_factor"); float scale = ctx.Attr("scale_x_y"); float bias = -0.5 * (scale - 1.); @@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - KeYoloBoxFw<<<<>>( input_data, imgsize_data, boxes_data, scores_data, conf_thresh, anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias); + input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); } }; diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 1cfef142bca732..e06c81052a0f42 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -13,6 +13,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) { + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel { float conf_thresh = ctx.Attr("conf_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); bool clip_bbox = ctx.Attr("clip_bbox"); + bool iou_aware = ctx.Attr("iou_aware"); + float iou_aware_factor = ctx.Attr("iou_aware_factor"); float scale = ctx.Attr("scale_x_y"); float bias = -0.5 * (scale - 1.); @@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel { for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + 
int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 4, iou_aware); T conf = sigmoid(input_data[obj_idx]); + if (iou_aware) { + int iou_idx = + GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); + T iou = sigmoid(input_data[iou_idx]); + conf = pow(conf, static_cast(1. - iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } if (conf < conf_thresh) { continue; } - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 0, iou_aware); GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, input_size_h, input_size_w, box_idx, stride, img_height, img_width, scale, bias); @@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel { CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 5, iou_aware); int score_idx = (i * box_num + j * stride + k * w + l) * class_num; CalcLabelScore(scores_data, input_data, label_idx, score_idx, class_num, conf, stride); diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h index 8cbc7cd6acd9ce..9a3be7ca439b9a 100644 --- a/paddle/fluid/operators/eigen/eigen_function.h +++ b/paddle/fluid/operators/eigen/eigen_function.h @@ -196,6 +196,26 @@ struct EigenRankLossGrad { const InType& left, const InType& right); }; +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& pred, + const InType& label, const T& epsilon); +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss, + const InType& pred, const InType& label, const T& epsilon); +}; + template struct EigenHingeLoss { using InType = Eigen::TensorMap< diff --git a/paddle/fluid/operators/eigen/loss.cc b/paddle/fluid/operators/eigen/loss.cc index 22a3647bc31751..469456537d9aa2 100644 --- a/paddle/fluid/operators/eigen/loss.cc +++ b/paddle/fluid/operators/eigen/loss.cc @@ -53,6 +53,39 @@ struct EigenRankLossGrad { template struct EigenRankLoss; template struct EigenRankLossGrad; +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& pred, const InType& label, const T& epsilon) { + out.device(dev) = (-(label * (pred + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - pred + epsilon).log())); + } +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, const InType& label, + const T& epsilon) { + dpred.device(dev) = + dloss * + (-(label / (pred + epsilon)) + + ((static_cast(1) - label) / (static_cast(1) - pred + epsilon))); + } +}; + +template struct EigenLogLoss; +template struct EigenLogLossGrad; + template struct EigenHingeLoss { using InType = Eigen::TensorMap< diff --git a/paddle/fluid/operators/eigen/loss.cu b/paddle/fluid/operators/eigen/loss.cu index fac7e3370bcede..02341202a2b4f1 100644 --- 
a/paddle/fluid/operators/eigen/loss.cu +++ b/paddle/fluid/operators/eigen/loss.cu @@ -53,6 +53,39 @@ struct EigenRankLossGrad { template struct EigenRankLoss; template struct EigenRankLossGrad; +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& pred, + const InType& label, const T& epsilon) { + out.device(dev) = (-(label * (pred + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - pred + epsilon).log())); + } +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, const InType& label, + const T& epsilon) { + dpred.device(dev) = + dloss * + (-(label / (pred + epsilon)) + + ((static_cast(1) - label) / (static_cast(1) - pred + epsilon))); + } +}; + +template struct EigenLogLoss; +template struct EigenLogLossGrad; + template struct EigenHingeLoss { using InType = Eigen::TensorMap< diff --git a/paddle/fluid/operators/eigen/pad.cu b/paddle/fluid/operators/eigen/pad.cu index ee7d042910527a..4cf88712d95cbb 100644 --- a/paddle/fluid/operators/eigen/pad.cu +++ b/paddle/fluid/operators/eigen/pad.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -57,6 +58,7 @@ INSTANTIATION(EigenPad, int64_t); INSTANTIATION(EigenPad, float); INSTANTIATION(EigenPad, double); INSTANTIATION(EigenPad, platform::float16); +INSTANTIATION(EigenPad, platform::bfloat16); INSTANTIATION(EigenPad, platform::complex); INSTANTIATION(EigenPad, platform::complex); #undef INSTANTIATION diff --git a/paddle/fluid/operators/eigen/slice.cu b/paddle/fluid/operators/eigen/slice.cu index f059508394f22c..dc51fa722202bb 100644 --- a/paddle/fluid/operators/eigen/slice.cu +++ b/paddle/fluid/operators/eigen/slice.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -57,6 +58,7 @@ INSTANTIATION(EigenSlice, int64_t); INSTANTIATION(EigenSlice, float); INSTANTIATION(EigenSlice, double); INSTANTIATION(EigenSlice, platform::float16); +INSTANTIATION(EigenSlice, platform::bfloat16); INSTANTIATION(EigenSlice, platform::complex); INSTANTIATION(EigenSlice, platform::complex); #undef INSTANTIATION diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 37e5fa5a206577..aff0cb281642ec 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -28,11 +28,11 @@ namespace operators { 1. For Unary Op, the length of input array is 1, e.g. Relu: return args[0] > 0 ? args[0] : 0; 2. For Binary Op, the length of input array is 2, - e.g. Add: return args[0] + args[1]; + e.g. 
Add: return args[0] expr args[1]; */ template struct CudaAddFunctor { - __device__ __forceinline__ T operator()(const T* args) const { + inline HOSTDEVICE T operator()(const T* args) const { return args[0] + args[1]; } }; @@ -42,18 +42,12 @@ class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis; - - std::vector ins = {x, y}; - std::vector outs = {z}; + std::vector ins; + std::vector outs; const auto& cuda_ctx = ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, CudaAddFunctor()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index ec7d036a1a1e02..a469ebbaec2edc 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -72,12 +72,10 @@ class ElementwiseAddKernel : public framework::OpKernel { auto *z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); if (x->dims() == y->dims()) { - SameDimsElemwiseAdd - LaunchElementwiseCpuKernel; + SameDimsElemwiseAdd LaunchElementwiseCpuKernel; LaunchElementwiseCpuKernel(ctx, x, y, z); } else { - LaunchBroadcastElementwiseCpuKernel(ctx, x, - y, z); + LaunchBroadcastElementwiseCpuKernel(ctx, x, y, z); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 5d086a1b29febd..d4b5d98d5b0b34 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -12,9 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaMaxFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return (args[0] > args[1] ? args[0] : args[1]); + } +}; + +template +class ElementwiseMaxKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMaxFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index cf93e5a97a3f31..4a99f7e36705f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -12,9 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
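Every elementwise kernel rewritten in this patch (add, max, min, mul, pow, sub) now shares the same Compute body; only the functor changes. The shared shape, sketched here with illustrative template arguments:

    // Common pattern of the rewritten CUDA elementwise kernels (sketch).
    std::vector<const framework::Tensor*> ins;
    std::vector<framework::Tensor*> outs;
    const auto& cuda_ctx =
        ctx.template device_context<platform::CUDADeviceContext>();
    // Packs X (and Y, if present) into ins, Out into outs; returns the axis.
    int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
        cuda_ctx, ins, &outs, axis, SomeFunctor<T>());  // e.g. CudaMinFunctor

The min kernel that follows mirrors the max functor with the branch arms swapped; everything around the functor is identical boilerplate.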
*/ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaMinFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return (args[0] > args[1] ? args[1] : args[0]); + } +}; + +template +class ElementwiseMinKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMinFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 8fd4609c3aa850..adcc18f837e670 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -24,37 +25,26 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseMul { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - MulRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaMulFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] * args[1]; } }; -template <> -struct SameDimsElemwiseMul { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseMulCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseMulKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + framework::Tensor x_for_selectedrows; + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs, &x_for_selectedrows); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMulFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 10e69491643c92..a734f891a9d9e8 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ 
-126,7 +126,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; - template <typename T> struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 1492fc629457cd..541ff9aacfc462 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -343,7 +343,6 @@ template const std::vector<const framework::Tensor *> &ins, std::vector<framework::Tensor *> *outs, int axis, Functor func) { - static_assert(ET == (ElementwiseType)2, "Only support binary calculation."); + PADDLE_ENFORCE_EQ(ET, ElementwiseType::kBinary, + platform::errors::InvalidArgument( + "Currently, only binary calculation is supported, " + "but received %d input tensors.\n", + static_cast<int>(ET))); int in_vec_size = 4; framework::Tensor *out = (*outs)[0]; for (auto *in : ins) { @@ -501,23 +504,28 @@ void LaunchBroadcastElementwiseCudaKernel( } } -template +template void LaunchElementwiseCudaKernel( const platform::CUDADeviceContext &cuda_ctx, const std::vector<const framework::Tensor *> &ins, std::vector<framework::Tensor *> *outs, int axis, Functor func) { + std::vector<int> dims_size; bool no_broadcast_flag = true; for (auto *in : ins) { no_broadcast_flag = ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); } if (no_broadcast_flag) { - LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>( cuda_ctx, ins, outs, func); + LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs, func); } else { - LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs, axis, func); + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs, + axis, func); } } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 32e49cf3996f12..d09e777670990a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -60,6 +60,71 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; namespace paddle { namespace operators { +/* +* Pack the input and output tensors into their respective vectors, +* taking the class type of variable X into account. +* Input variable X may be either a LoDTensor or a SelectedRows; +* when X is a SelectedRows, a valid pointer x_for_selectedrows +* is expected to be passed in from the op kernel so that this +* function can hand back the address of the LoDTensor it creates +* for the computation. +*/ +template <typename OutT> +int PackTensorsIntoVector(const framework::ExecutionContext &ctx, + std::vector<const framework::Tensor *> *ins, + std::vector<framework::Tensor *> *outs, + framework::Tensor *x_for_selectedrows = nullptr) { + int axis = -1; + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE_NOT_NULL( + x_var, platform::errors::InvalidArgument( + "Unable to get input Variable X, Variable name is %s.\n", + ctx.InputName("X"))); + auto *y = ctx.Input<framework::LoDTensor>("Y"); + framework::Tensor *z; + + if (x_var->IsType<framework::LoDTensor>()) { + auto *x = ctx.Input<framework::LoDTensor>("X"); + z = ctx.Output<framework::LoDTensor>("Out"); + ins->emplace_back(x); + } else if (x_var->IsType<framework::SelectedRows>()) { + PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, + platform::errors::InvalidArgument( + "For elementwise_op, if X is Sparse, Y must be " + "scalar.
But received the size of Y = %d.", + y->dims().size())); + PADDLE_ENFORCE_NOT_NULL( + x_for_selectedrows, + platform::errors::InvalidArgument( + "The parameter x_for_selectedrows is expected to " + "be valid, once input variable X's class type is " + "SelectedRows.\n")); + auto &x_sele = x_var->Get<framework::SelectedRows>(); + auto out_sele = ctx.Output<framework::SelectedRows>("Out"); + *x_for_selectedrows = x_sele.value(); + out_sele->set_rows(x_sele.rows()); + out_sele->set_height(x_sele.height()); + out_sele->mutable_value()->Resize(x_sele.value().dims()); + out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x_for_selectedrows->type()); + z = ctx.Output<framework::SelectedRows>("Out")->mutable_value(); + ins->emplace_back(x_for_selectedrows); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " "LoDTensor or SelectedRows.", framework::ToTypeName(x_var->Type()))); + } + z->mutable_data<OutT>(ctx.GetPlace()); + outs->emplace_back(z); + + if (y != nullptr) { + ins->emplace_back(y); + axis = ctx.HasAttr("axis") ? ctx.Attr<int>("axis") : -1; + } + return axis; +} + /* * Out = X ⊙ Y * If Y's shape does not match X' shape, they will be reshaped. diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 320d1e7b38da8e..5335f274ef126f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -8,10 +8,52 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template <typename T, typename Enable = void> +struct CudaPowFunctor { + inline HOSTDEVICE T operator()(const T args[]) const { + return std::pow(args[0], args[1]); + } +}; + +template <typename T> +struct CudaPowFunctor< + T, typename std::enable_if<std::is_integral<T>::value>::type> { + // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and + // it will return a float number like 2.99... , which floors to 2 + // when cast to int by default, and that is wrong. + // Use llrint to cast it to the nearest integer, which is 3. + inline HOSTDEVICE T operator()(const T args[]) const { + return std::llrint(std::pow(args[0], args[1])); + } +}; + +template <typename T> +class ElementwisePowKernel<platform::CUDADeviceContext, T> + : public framework::OpKernel<T> { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector<const framework::Tensor*> ins; + std::vector<framework::Tensor*> outs; + const auto& cuda_ctx = + ctx.template device_context<platform::CUDADeviceContext>(); + + int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs); + LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>( + cuda_ctx, ins, &outs, axis, CudaPowFunctor<T>()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_pow, ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>, diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 19cbbb7bf04287..da9610243f7c4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -11,8 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
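The integer specialization of CudaPowFunctor above exists purely because of rounding; a worked example of the failure mode it guards against (values are representative, not measured):

    float r = powf(3.0f, 1.0f);           // may evaluate to 2.99999976f on device
    int truncated = static_cast<int>(r);  // 2 -- wrong
    long long rounded = std::llrint(r);   // 3 -- what the functor returns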
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -24,37 +23,25 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - SubRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaSubFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] - args[1]; } }; -template <> -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseSubCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseSubKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaSubFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 4171d2eb9e5e53..42609341327609 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
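When no axis attribute is given (axis == -1), the LaunchElementwiseCudaKernel change above derives the broadcast start position from the rank difference of the inputs. A self-contained sketch of that rule:

    #include <algorithm>
    #include <vector>

    // axis = -1 resolves to (max rank - min rank), so the lower-rank tensor
    // is aligned starting at that dimension of the higher-rank one.
    int DefaultBroadcastAxis(const std::vector<int>& ranks) {
      return *std::max_element(ranks.begin(), ranks.end()) -
             *std::min_element(ranks.begin(), ranks.end());
    }
    // Example: X with shape [32, 16, 8] and Y with shape [16, 8]
    // gives axis = 3 - 2 = 1, matching Y against dims 1..2 of X.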
*/ - #pragma once + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 3656de3525d32c..ea1bca8b4d58df 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -209,6 +209,73 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, } } +template +__global__ void LayerNormForwardFP16(const T *x, const U *scale, const U *bias, + T *y, U *mean, U *var, float epsilon, + int feature_size) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ U mean_share; + __shared__ U var_share; + + int beg_idx = blockIdx.x * feature_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * feature_size; + + // Step 1: Reduce to calculate mean and var + U mean_val = 0; + U var_val = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + U tmp = static_cast(x[i]); + mean_val += tmp; + var_val += (tmp * tmp); + } + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(mean_val, var_val), + PairForLayerNormAddFunctor()); + if (threadIdx.x == 0) { + auto tmp = pair.first_ / static_cast(feature_size); + mean[blockIdx.x] = mean_share = static_cast(tmp); + var[blockIdx.x] = var_share = + static_cast(pair.second_ / static_cast(feature_size) - tmp * tmp); + } + __syncthreads(); + + mean_val = mean_share; + U invvar = rsqrt_(var_share + static_cast(epsilon)); + + // Step 2: Calculate y + if (scale != nullptr) { + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast( + scale[j] * (static_cast(x[i]) - mean_val) * invvar + bias[j]); + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast(scale[j] * (static_cast(x[i]) - mean_val) * + invvar); + } + } + } else { // scale == nullptr + if (bias != nullptr) { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar + + bias[j]); + } + } else { + for (int i = beg_idx, j = threadIdx.x; i < end_idx; + i += BlockDim, j += BlockDim) { + y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar); + } + } + } +#endif +} + template __inline__ __device__ void cuLoadAddStridedInputs( const int i1_block, const int thr_load_row_off, const int thr_load_col_off, @@ -872,6 +939,28 @@ void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, } } +template <> +void LayerNormDirectCUDAFunctor::operator()( + gpuStream_t stream, const half *input, std::vector input_shape, + const half *bias, const half *scale, half *output, half *mean, + half *variance, int begin_norm_axis, float eps) { + const auto x_dims = framework::make_ddim(input_shape); + auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); + int batch_size = static_cast(matrix_dim[0]); + int feature_size = static_cast(matrix_dim[1]); + switch (GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForwardFP16<<>>( + input, scale, bias, output, mean, variance, eps, feature_size)); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Product from begin_norm_axis to end in layer_norm must be larger " + "than 
1")); + break; + } +} + template class LayerNormKernel : public framework::OpKernel { @@ -961,6 +1050,9 @@ class LayerNormGradKernel }; template class LayerNormDirectCUDAFunctor; +#ifdef TRT_PLUGIN_FP16_AVALIABLE +template class LayerNormDirectCUDAFunctor; +#endif #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE #undef FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 1569512dc74f72..c41805d41cef46 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -154,3 +154,8 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h index e62de17a986031..e7985ab810b138 100644 --- a/paddle/fluid/operators/log_loss_op.h +++ b/paddle/fluid/operators/log_loss_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -40,9 +41,8 @@ class LogLossKernel : public framework::OpKernel { auto loss = EigenVector::Flatten(*loss_out); auto& place = *ctx.template device_context().eigen_device(); - loss.device(place) = (-(label * (prediction + epsilon).log()) - - ((static_cast(1) - label) * - (static_cast(1) - prediction + epsilon).log())); + EigenLogLoss, T>::Eval( + place, loss, prediction, label, epsilon); } }; @@ -64,9 +64,8 @@ class LogLossGradKernel : public framework::OpKernel { if (dpred) { dpred->mutable_data(ctx.GetPlace()); auto dx = framework::EigenVector::Flatten(*dpred); - dx.device(place) = dl * (-(label / (prediction + epsilon)) + - ((static_cast(1) - label) / - (static_cast(1) - prediction + epsilon))); + EigenLogLossGrad, T>::Eval( + place, dx, dl, prediction, label, epsilon); } } }; diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index e4fe92c625640d..12c607adb44f4e 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -104,7 +104,7 @@ __global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, #pragma unroll for (int it = 0; it < warp_iter; ++it) { int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < element_count) { + if (element_index < effective_element_count) { dst[batch_id * element_count + element_index] = static_cast(elements[it] - max_value - sum); } else { @@ -226,7 +226,7 @@ __global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, #pragma unroll for (int iter = 0; iter < warp_iter; ++iter) { int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < element_count) { + if (element_index < effective_element_count) { grad_input[batch_id * element_count + element_index] = static_cast( (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); } diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index 3b44c02757fae9..17bf5df18adc54 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -26,8 +26,9 @@ class MaskedSelectOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); 
OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - framework::DDim output_dims(ctx->GetInputDim("X")); - ctx->SetOutputDim("Y", output_dims); + + // output will only be a 1-D Tensor + ctx->SetOutputDim("Y", framework::make_ddim({-1})); ctx->ShareLoD("X", /*->*/ "Y"); } diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 512f9c62415e5d..4d7218cd89e04b 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" @@ -311,6 +312,156 @@ __global__ void SoftmaxKernelWithEltadd2( #endif } +template +__global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk, + const int batch_size, + const int head_num, + const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + T stride_max = -1e20f; + for (int i = 0; i < seq_len; i += blockDim.x) { + stride_max = qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] > + stride_max + ? qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] + : stride_max; + } + T max_val = blockReduceMax(stride_max, mask); + + T stride_sum = 0.f; + for (int i = 0; i < seq_len; i += blockDim.x) { + stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] - max_val); + } + T sum_val = blockReduceSum(stride_sum, mask); + + for (int i = 0; i < seq_len; i += blockDim.x) { + qk_buf[threadIdx.x + i + qk_offset] = + (T)(__expf(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] - max_val) / + sum_val); + } +} + +// HIP defined __HIP_NO_HALF_CONVERSIONS__ +#ifndef __HIPCC__ // @{ Half kernel: SoftmaxKernelWithEltadd +template <> +__global__ void SoftmaxKernelWithEltaddForLarge( + half *qk_buf, const half *bias_qk, const int batch_size, const int head_num, + const int seq_len, const unsigned mask) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float stride_max = -1e20f; + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]); + stride_max = tmp > stride_max ? 
tmp : stride_max; + } + float max_val = blockReduceMax(stride_max, mask); + + float stride_sum = 0.f; + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]); + stride_sum += __expf(tmp - max_val); + } + float sum_val = blockReduceSum(stride_sum, mask); + + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = + __expf(static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]) - + max_val); + qk_buf[threadIdx.x + i + qk_offset] = (half)(tmp / sum_val); + } +#endif +} +#endif // @} End Half kernel: SoftmaxKernelWithEltadd + +template +__global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_, + const int batch_size, + const int head_num, + const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float2 stride_max = make_float2(-1e20f, -1e20f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_max.x = max(stride_max.x, cur.x); + stride_max.y = max(stride_max.y, cur.y); + } + float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + + float2 stride_sum = make_float2(0.f, 0.f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_sum.x += __expf(cur.x - max_val); + stride_sum.y += __expf(cur.y - max_val); + } + + float sum_val = + blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); + } +} + +template <> +__global__ void SoftmaxKernelWithEltaddForLarge2( + half2 *qk_buf_, const half2 *bias_qk_, const int batch_size, + const int head_num, const int seq_len, const unsigned mask) { +// operator "+" of half only suppotted after cuda version 10.0 +// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake +#if defined(PADDLE_WITH_CUDA) && \ + (CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000) + + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float2 stride_max = make_float2(-1e20f, -1e20f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_max.x = max(stride_max.x, cur.x); + stride_max.y = max(stride_max.y, cur.y); + } + float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + + float2 stride_sum = make_float2(0.f, 0.f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_sum.x += __expf(cur.x - max_val); + stride_sum.y += __expf(cur.y - max_val); + } + + float sum_val = + blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); + } +#endif +} + template inline void MatMulWithHeadQK(const 
platform::CUDADeviceContext &context, int head_num, int seq_len, int size_per_head, @@ -332,31 +483,48 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, reinterpret_cast(qk_buf_), batch_size * head_num, seq_len * size_per_head, seq_len * size_per_head); - int grid = batch_size * head_num * seq_len; - int block = seq_len; - - // Align block to 32, also limit seq_len to max block size. - PADDLE_ENFORCE_LE(seq_len, 1024, platform::errors::InvalidArgument( - "seq_len should <= 1024, " - "but received seq_len is:%d", - seq_len)); - if (seq_len % 2 == 0) { - block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; - if (std::is_same::value) { - SoftmaxKernelWithEltadd2<<>>( - reinterpret_cast(qk_buf_), - reinterpret_cast(bias_qk), batch_size, head_num, - seq_len / 2, FINAL_MASK); + if (seq_len <= 1024) { + int grid = batch_size * head_num * seq_len; + int block = seq_len; + + // Align block to 32, also limit seq_len to max block size. + if (seq_len % 2 == 0) { + block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; + if (std::is_same::value) { + SoftmaxKernelWithEltadd2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltadd2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } } else { - SoftmaxKernelWithEltadd2<__half2><<>>( - reinterpret_cast<__half2 *>(qk_buf_), - reinterpret_cast(bias_qk), batch_size, head_num, - seq_len / 2, FINAL_MASK); + block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; + SoftmaxKernelWithEltadd<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); } } else { - block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; - SoftmaxKernelWithEltadd<<>>( - qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + int grid = batch_size * head_num * seq_len; + int block = 512; + if (seq_len % 2 == 0) { + if (std::is_same::value) { + SoftmaxKernelWithEltaddForLarge2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltaddForLarge2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } + } else { + SoftmaxKernelWithEltaddForLarge<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + } } } diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 011c85caf04bbb..c8e2acea451a47 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -437,6 +437,8 @@ void TestConcatMain() { ConcatCase2(context); ConcatCase3(context); ConcatCase4(context); + + delete context; } TEST(math, concat) { diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 3388d7edafecc4..32f9938dcacfbb 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -208,6 +208,7 @@ void GemvTest(int m, int n, bool trans) { ASSERT_FLOAT_EQ(data_c[i], sum); } } + delete cpu_place; } TEST(math_function, gemv) { @@ -274,6 +275,7 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) { for (int i = 0; i < mat_c_mkl.numel(); ++i) { EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); } + delete cpu_place; } TEST(math_function, gemm_warp) { diff --git 
a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index cc3b838cbcf1d7..5a8e7fcc2a76c2 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -116,6 +116,9 @@ void testVol2col() { for (int i = 0; i < 12; ++i) { EXPECT_EQ(in_ptr[i], col_2_vol[i]); } + + delete place; + delete context; } TEST(math, vol2col) { diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc old mode 100755 new mode 100644 index 479f9643749d63..8f30dd5b2e68a4 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("LearningRate", "(LoDTensor, default LoDTensor) " "Input learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(LoDTensor) This output is updated parameter. " @@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("VelocityOut", "(LoDTensor) This output is updated velocity. " "It shared memory with Input(Velocity)."); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shares memory with Input(MasterParam).") + .AsDispensable(); AddAttr("mu", "(float) Momentum coefficient"); AddAttr("lars_coeff", "(float, default 0.001) LARS coefficient.") @@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("epsilon", "(float, default 0.0) epsilon to avoid Division by Zero.") .SetDefault(0.0); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddAttr( + "rescale_grad", + "(float, default 1.0) Multiply the gradient with `rescale_grad` " + "before updating. Often chosen to be `1.0/batch_size`.") + .SetDefault(1.0f); AddComment(R"DOC( Lars Momentum Optimizer. diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index eb0111ae4de2f0..42477232e7ca1b 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -13,36 +13,64 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" namespace paddle { namespace operators { template -__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, - const T* learning_rate, const T mu, - const int64_t num, const T lars_coeff, - const T lars_weight_decay, const T* p_norm, - const T* g_norm, T* p_out, T* v_out, - const T epsilon) { - T lr = learning_rate[0]; - T local_lr = learning_rate[0]; +using MultiPrecisionType = typename details::MPTypeTrait::Type; + +template +__global__ void MomentumLarsKernel( + const T* p, const T* g, const MT* v, + const MultiPrecisionType* learning_rate, const MT mu, const int64_t num, + const MT lars_coeff, const MT lars_weight_decay, + const MultiPrecisionType* p_norm, const MultiPrecisionType* g_norm, + T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out, + const MultiPrecisionType rescale_grad) { + const MT lr = static_cast(learning_rate[0]); + MT local_lr = lr; + const MT p_n = static_cast(p_norm[0]); + const MT g_n = static_cast(g_norm[0]); + + if (lars_weight_decay > static_cast(0) && p_n > static_cast(0) && + g_n > static_cast(0)) { + local_lr = + lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon); + } CUDA_KERNEL_LOOP(i, num) { - if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) { - local_lr = lr * lars_coeff * p_norm[0] / - (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon); - } + MT grad = static_cast(g[i]) * static_cast(rescale_grad); + MT param = master_p ? master_p[i] : static_cast(p[i]); + + MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param); + MT p_new = param - v_new; - T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); v_out[i] = v_new; - p_out[i] = p[i] - v_new; + p_out[i] = static_cast(p_new); + if (master_p_out) master_p_out[i] = p_new; } } template class LarsMomentumOpCUDAKernel : public framework::OpKernel { + using MPDType = MultiPrecisionType; + public: void Compute(const framework::ExecutionContext& ctx) const override { + const bool multi_precision = ctx.Attr("multi_precision"); + if (multi_precision) { + InnerCompute(ctx, multi_precision); + } else { + InnerCompute(ctx, multi_precision); + } + } + + private: + template + void InnerCompute(const framework::ExecutionContext& ctx, + const bool multi_precision) const { auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); @@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { auto grad = ctx.Input("Grad"); auto learning_rate = ctx.Input("LearningRate"); + const framework::Tensor* master_param = nullptr; + framework::Tensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + + const MT* master_p = multi_precision ? master_param->data() : nullptr; + MT* master_p_out = multi_precision + ? 
master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + T* p_out = param_out->mutable_data(ctx.GetPlace()); - T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + MT* v_out = velocity_out->mutable_data(ctx.GetPlace()); - T mu = static_cast(ctx.Attr("mu")); - T lars_coeff = ctx.Attr("lars_coeff"); - T lars_weight_decay = ctx.Attr("lars_weight_decay"); - T epsilon = ctx.Attr("epsilon"); + MT mu = static_cast(ctx.Attr("mu")); + MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); + MT lars_weight_decay = + static_cast(ctx.Attr("lars_weight_decay")); + MT epsilon = static_cast(ctx.Attr("epsilon")); + MPDType rescale_grad = + static_cast(ctx.Attr("rescale_grad")); auto* p = param->data(); - auto* v = velocity->data(); auto* g = grad->data(); - auto* lr = learning_rate->data(); + auto* v = velocity->data(); + auto* lr = learning_rate->data(); int block = 512; int grid = (param->numel() + block - 1) / block; @@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { framework::Tensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); g_norm_t.Resize({1}); - auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); - auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); + auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); + auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); auto* place = ctx.template device_context().eigen_device(); - ep_norm.device(*place) = eigen_p.square().sum().sqrt(); - eg_norm.device(*place) = eigen_g.square().sum().sqrt(); - MomentumLarsKernel<<>>( + + // eigen unsupport fp16 l2-norm + ep_norm.device(*place) = + eigen_p.template cast().square().sum().sqrt(); + eg_norm.device(*place) = + (eigen_g.template cast() * rescale_grad).square().sum().sqrt(); + + MomentumLarsKernel< + T, MT><<>>( p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out, epsilon); + p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out, + rescale_grad); } }; @@ -93,4 +150,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lars_momentum, ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel); + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index cbb0704fa857b7..f461dec66c0e75 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); + if (ctx->HasOutput("MasterParamOut")) { + ctx->SetOutputDim("MasterParamOut", param_dim); + } } framework::OpKernelType GetExpectedKernelType( diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index c2f68675beb621..ce6db633c9566e 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object, for (size_t i = 0; i < result_tuple.size(); i++) { if ((*outs)[i] != nullptr) { if (Py_None != result_tuple[i].ptr()) { - try { - auto result_var = - result_tuple[i].cast>(); - *(*outs)[i] = result_var->Var(); - } catch (py::cast_error &) { + if 
(py::isinstance(result_tuple[i])) { + try { + auto result_var = + result_tuple[i].cast>(); + *(*outs)[i] = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.backward` function returns an invalid argument, " + "the `%s` type argument cannot be cast into `Tensor`.", + result_tuple[i].ptr()->ob_type->tp_name)); + } + } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The output of `PyLayer.backward` should be `Tensor`.")); + "The output of `PyLayer.backward` should be `Tensor`, but " + "received `%s`.", + result_tuple[i].ptr()->ob_type->tp_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -94,13 +103,22 @@ void RunPyObject(py::object *py_object, } if ((*outs)[0] != nullptr) { if (Py_None != py_result.ptr()) { - try { - auto result_var = - py_result.cast>(); - *((*outs)[0]) = result_var->Var(); - } catch (py::cast_error &) { + if (py::isinstance(py_result)) { + try { + auto result_var = + py_result.cast>(); + *((*outs)[0]) = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.backward` function returns an invalid argument, the " + "`%s` type argument cannot be cast into `Tensor`.", + py_result.ptr()->ob_type->tp_name)); + } + } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The output of `PyLayer.backward` should be `Tensor`.")); + "The output of `PyLayer.backward` should be `Tensor`, but " + "received `%s`.", + py_result.ptr()->ob_type->tp_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index f5d55791d86c68..17c84530b23e66 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -68,7 +68,6 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif - is_same_place_ = false; cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); @@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) { std::vector cuda_pinned_ptrs; cuda_pinned_ptrs.reserve(cpu.size()); platform::RecordEvent record_event("BufferedReader:MemoryCopy"); - // NODE(chenwehiang): When we use CUDAPinned Memory, we need call + // NOTE(chenweihang): When we use CUDAPinned Memory, we need call // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. // If we don't set Device here, which will use CUDAPlace(0) default. 
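// [Editor's note] A hedged illustration of the pinned-memory pattern the NOTE
// above motivates: cudaHostAlloc is itself a CUDA API call, so the current
// device must be set first or the context lands on CUDAPlace(0). Minimal
// standalone host-side sketch using only the public CUDA runtime API (the
// function and variable names are illustrative, not Paddle's reader code;
// error handling omitted for brevity):
#include <cuda_runtime.h>
#include <cstring>

void CopyThroughPinned(const float* src, float* dev_dst, size_t n, int dev_id,
                       cudaStream_t stream) {
  cudaSetDevice(dev_id);  // pin the CUDA context to the intended card first
  float* pinned = nullptr;
  cudaHostAlloc(reinterpret_cast<void**>(&pinned), n * sizeof(float),
                cudaHostAllocDefault);
  std::memcpy(pinned, src, n * sizeof(float));  // stage into page-locked memory
  // H2D copies overlap with compute only when the source is pinned memory.
  cudaMemcpyAsync(dev_dst, pinned, n * sizeof(float), cudaMemcpyHostToDevice,
                  stream);
  cudaStreamSynchronize(stream);
  cudaFreeHost(pinned);
}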
@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) { if (platform::is_cpu_place(cpu[i].place())) { cuda[i].Resize(cpu[i].dims()); cuda[i].set_layout(cpu[i].layout()); - cuda_pinned_ptrs.emplace_back( - cuda[i].mutable_data(cuda_pinned_place, cpu[i].type())); + cuda_pinned_ptrs[i] = + cuda[i].mutable_data(cuda_pinned_place, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), cpu[i].data(), size); + cuda[i].set_lod(cpu[i].lod()); } else { - // we set same place flag & use cpu[i] directly - is_same_place_ = true; + // Here the cpu[i]'s place may be CUDAPlace, CUDAPinnedPlace, or + // others, we don't copy the memory of it to CUDAPinnedPlace, but + // we should share tensor data to cuda[i] + cuda[i].ShareDataWith(cpu[i]); } } } else { @@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector *out) { return; } - if (platform::is_gpu_place(place_) && !is_same_place_) { + if (platform::is_gpu_place(place_)) { *out = std::move(cuda_buffer_[i]); - } else if (platform::is_npu_place(place_) && !is_same_place_) { + } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 9f7b0e753281eb..5b4bbc7d62cd8f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader { // buffer, just read async and create futures as buffer size. However, to // malloc tensors every time is extremely slow. Here we store all data in // buffers and prevent alloc every time. - bool is_same_place_; std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index d6ba399439d023..934802f6a9e0e9 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward( T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward( : ceil(roi_height / pooled_height); int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_ymin + ph * bin_size_h + @@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward( T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 46564ed4f629d8..29c9268d5241cc 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel { T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel { int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); Tensor pre_pos; Tensor pre_w; int pre_size = count * out_stride[1]; @@ -362,6 +364,8 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 2d599716443901..69b2c5b7380073 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -83,6 +83,13 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "contains at most one scope." 
"NOTE: Do not use Scope directly because Scope output is not " "currently supported."); + AddOutput("DOut", + "(vector)" + "The output tensors for GRAD Tensors in RunProgram forward " + "operator, the forward operator contains GRAD Tensors when it " + "computes double grad.") + .AsDuplicable() + .AsDispensable(); AddAttr("global_block", "(BlockDesc *)" "The global block of executed program desc."); @@ -154,6 +161,7 @@ class RunProgramGradOpMaker : public framework::SingleGradOpMaker { grad_op->SetInput("Params", this->Input("Params")); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetInput("OutScope", this->Output("OutScope")); + grad_op->SetInput("DOut", this->Output("DOut")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetOutput(framework::GradVarName("Params"), this->InputGrad("Params")); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index f78f5c5b948c63..c7aeb0e145e4cb 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -131,6 +131,9 @@ static void ShareVarsIntoScope(const std::vector &vars, const std::vector &var_names, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + if (var_names[i] == "Fake_var") { + continue; + } auto *var = scope->Var(var_names[i]); CheckInputVarStatus(*vars[i], var_names[i]); VariableShare(*vars[i], var); @@ -141,9 +144,9 @@ static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { - if (var_names[i] == framework::kEmptyVarName) { - VLOG(2) << "find variable name is " << framework::kEmptyVarName - << ", skip it!"; + if (var_names[i] == framework::kEmptyVarName || + var_names[i] == "Fake_var") { + VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } // NOTE: Here skip not found var is dangerous, if a bug is caused here, @@ -170,9 +173,11 @@ class RunProgramOpKernel : public framework::OpKernel { auto &input_vars = ctx.MultiInputVar("X"); auto ¶m_vars = ctx.MultiInputVar("Params"); auto output_vars = ctx.MultiOutputVar("Out"); + auto dout_vars = ctx.MultiOutputVar("DOut"); auto input_var_names = ctx.InputNames("X"); auto output_var_names = ctx.OutputNames("Out"); + auto dout_var_names = ctx.OutputNames("DOut"); // current program may not hold parameters std::vector param_names; @@ -195,7 +200,7 @@ class RunProgramOpKernel : public framework::OpKernel { // Step 2. prepare executor and init persistable variables framework::Executor exe(ctx.GetPlace()); auto exe_ctx = framework::GetExecutorInfoFromCache( - exe, ctx, {output_var_names}, /*is_grad=*/false); + exe, ctx, {output_var_names, dout_var_names}, /*is_grad=*/false); // NOTE(Aurelius84): While training some models, forward can be called many // times and then apply backpropagation all at once, such as Reinforcement @@ -219,6 +224,7 @@ class RunProgramOpKernel : public framework::OpKernel { // Step 4. 
Get Output details::ShareVarsFromScope(output_vars, output_var_names, &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index c83726180baeae..f94fce66806eee 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -54,4 +54,6 @@ TEST(scatter, ScatterUpdate) { EXPECT_EQ(output.data()[i], static_cast(i - 4)); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output.data()[i], 0.0f); + + delete cpu_place; } diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index eca51147f8159e..c7b61333cdab3d 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) { return value_name; } -inline void CheckAndUpdateSlice(const framework::DDim in_dims, - const std::vector axes, - std::vector* starts, - std::vector* ends, - std::vector* steps) { - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - int64_t dim_value = in_dims[axis]; - - int64_t start = - (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i]; - int64_t end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i]; - start = std::max(start, static_cast(0)); - end = std::min(end, dim_value); - - int64_t step = (*steps)[i]; - PADDLE_ENFORCE_NE( - step, 0, platform::errors::InvalidArgument( - "Step should not be 0, but received step = %d.", step)); - if (step > 0) { - start = std::min(start, dim_value); - end = std::max(end, static_cast(0)); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument( - "When step > 0, end should be greater than start, but " - "received end = %d, start = %d.", - end, start)); - } else { - // NOTE(liym27): When step < 0, start should less and equal to dim_value-1 - // "end is -1" means contain the 0-th element of this axis. - start = std::min(start, dim_value - 1); - end = std::max(end, static_cast(-1)); - PADDLE_ENFORCE_GT( - start, end, - platform::errors::InvalidArgument( - "When step < 0, start should be greater than end, but " - "received start = %d, end = %d.", - start, end)); - } - - (*starts)[i] = start; - (*ends)[i] = end; - } -} - -inline framework::DDim GetSliceDims(const framework::DDim in_dims, - const std::vector& axes, - const std::vector& starts, - const std::vector& ends, - const std::vector& steps) { - framework::DDim slice_dims(in_dims); - - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - int64_t start = starts[i]; - int64_t end = ends[i]; - int64_t step = steps[i]; - - if (step > 0) { - slice_dims[axis] = (end - start + step - 1) / step; - } else { - slice_dims[axis] = (end - start + step + 1) / step; - } - } - return slice_dims; -} - -inline framework::DDim GetDecreasedDims( - const framework::DDim slice_dims, - const std::vector& decrease_axes) { - // Get dims after decreasing axes. 
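// [Editor's note] The slice helpers deleted here reappear, generalized, in
// the new slice_utils.h later in this patch. The heart of GetSliceDims is the
// ceil-division below; a tiny standalone sketch with worked examples
// (SliceSize is an illustrative name, not part of the patch):
#include <cstdint>

static inline int64_t SliceSize(int64_t start, int64_t end, int64_t step) {
  // step > 0: ceil((end - start) / step); the mirrored form handles negative
  // steps, where end == -1 means "stop before index 0".
  return step > 0 ? (end - start + step - 1) / step
                  : (end - start + step + 1) / step;
}
// SliceSize(4, 10, 1)  == 6  -> indices 4..9
// SliceSize(0, 5, 2)   == 3  -> indices 0, 2, 4
// SliceSize(4, -1, -2) == 3  -> indices 4, 2, 0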
- framework::DDim decreased_dims(slice_dims); - if (decrease_axes.size() > 0) { - for (size_t i = 0; i < decrease_axes.size(); ++i) { - int64_t axis = decrease_axes[i]; - PADDLE_ENFORCE_EQ( - decreased_dims[axis], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - decreased_dims[axis] = 0; - } - - std::vector new_shape; - for (int i = 0; i < decreased_dims.size(); ++i) { - if (decreased_dims[i] != 0) { - new_shape.push_back(decreased_dims[i]); - } - } - - // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and - // uses [1] instead. - if (new_shape.size() == 0) { - new_shape.push_back(1); - } - - decreased_dims = framework::make_ddim(new_shape); - } - return decreased_dims; -} - template class SetValueKernel : public framework::OpKernel { public: @@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel { } auto in_dims = in->dims(); - CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); + auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index b5298979721642..01daba7c072845 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of slice op should not be null.")); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "slice"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "slice"); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of slice op should not be null.")); + // Case 1: Special treatment when input is a tensor array. auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto axes = ctx->Attrs().Get>("axes"); if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { @@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel { return; } } + + // Case 2: input is a tensor. auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT(in_dims.size(), 7, platform::errors::InvalidArgument( @@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel { auto starts = ctx->Attrs().Get>("starts"); auto ends = ctx->Attrs().Get>("ends"); - auto infer_flags = ctx->Attrs().Get>("infer_flags"); auto decrease_axis = ctx->Attrs().Get>("decrease_axis"); - - auto starts_size = starts.size(); - auto ends_size = ends.size(); + auto infer_flags = ctx->Attrs().Get>("infer_flags"); if (infer_flags.empty()) { // Initialize infer_flags with 1. // To be compatible with other op tests in which infer_flags is not set. infer_flags = std::vector(axes.size(), 1); } + // 2.1 Check attrs. 
+ auto starts_size = starts.size(); + auto ends_size = ends.size(); + if (ctx->HasInputs("StartsTensorList")) { - auto StartsTensorList = ctx->Inputs("StartsTensorList"); - PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, + starts_size = ctx->Inputs("StartsTensorList").size(); + PADDLE_ENFORCE_GT(starts_size, 0, platform::errors::InvalidArgument( "StartsTensorList size can't be zero")); - starts_size = StartsTensorList.size(); } if (ctx->HasInputs("EndsTensorList")) { - auto EndsTensorList = ctx->Inputs("EndsTensorList"); - PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, - platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); - ends_size = EndsTensorList.size(); + ends_size = ctx->Inputs("EndsTensorList").size(); + PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); } - if (ctx->HasInput("StartsTensor") == false) { + if (!ctx->HasInput("StartsTensor")) { PADDLE_ENFORCE_EQ( starts_size, axes.size(), platform::errors::InvalidArgument( "The size of starts must be equal to the size of axes.")); } - if (ctx->HasInput("EndsTensor") == false) { + if (!ctx->HasInput("EndsTensor")) { PADDLE_ENFORCE_EQ( ends_size, axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - PADDLE_ENFORCE_LT(static_cast(axes[i]), in_dims.size(), - platform::errors::InvalidArgument( - "The index of dimension in axes must be less " - "than the size of input shape.")); - if (infer_flags[i] == -1) { - out_dims[axes[i]] = -1; - } else { - // infer out_dim shape - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - - PADDLE_ENFORCE_LE(start, dim_value, - platform::errors::InvalidArgument( - "start should be less than or equal to the " - "dimension value, but received " - "start = %d, shape[%d] = %d.", - starts[i], axes[i], out_dims[axes[i]])); - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "end should greater than start, but received " - "end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; - } - } - } - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - if (ctx->IsRuntime() && infer_flags[i] != -1) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - } - out_dims[decrease_axis[i]] = 0; - } + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, nullptr, + &infer_flags); - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - - out_dims = framework::make_ddim(new_out_shape); + auto slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, &infer_flags); + if (ctx->IsRuntime()) { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, &infer_flags); + } else { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, nullptr); } + ctx->SetOutputDim("Out", out_dims); if (axes[0] != 0) { ctx->ShareLoD("Input", /*->*/ "Out"); @@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace()); } + framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 3d294ae238986c..96b8ea11d6845e 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -19,21 +19,67 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using Variable = framework::Variable; +using LoDTensorArray = framework::LoDTensorArray; +using DDim = framework::DDim; + +inline void DealTensorArray(const framework::ExecutionContext& ctx, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = ctx.Input("Input"); + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + PADDLE_ENFORCE_GT(end, start, + platform::errors::InvalidArgument( + "Attr(ends) should be greater than attr(starts) in " + "slice op. 
But received end = %d, start = %d.", + ends[0], starts[0])); + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = ctx.Output("Out"); + out_array->resize(out_size); + + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out = ctx.Output("Out"); + auto in_tensor = in_array->at(start); + TensorCopy(in_tensor, ctx.GetPlace(), out); + } +} template class SliceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - int rank = is_tensor_array - ? 1 - : ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_tensor_array = input_var->IsType(); + int rank = is_tensor_array ? 1 : ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -54,53 +100,45 @@ class SliceKernel : public framework::OpKernel { case 6: SliceCompute<6>(ctx); break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); } } private: template - void SliceCompute(const framework::ExecutionContext& context) const { - auto& place = - *context.template device_context().eigen_device(); - const framework::Variable* input_var = context.InputVar("Input"); - framework::Variable* out_var = context.OutputVar("Out"); - bool input_is_tensor_array = input_var->IsType(); - bool out_is_tensor_array = out_var->IsType(); - - auto axes = context.Attr>("axes"); - - auto starts_int = context.Attr>("starts"); + void SliceCompute(const framework::ExecutionContext& ctx) const { + const Variable* input_var = ctx.InputVar("Input"); + Variable* out_var = ctx.OutputVar("Out"); + bool input_is_array = input_var->IsType(); + bool out_is_array = out_var->IsType(); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); std::vector starts(starts_int.begin(), starts_int.end()); - auto ends_int = context.Attr>("ends"); std::vector ends(ends_int.begin(), ends_int.end()); - auto decrease_axis = context.Attr>("decrease_axis"); - auto infer_flags = context.Attr>("infer_flags"); - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - - bool need_infer = false; - if (context.HasInput("StartsTensor") || context.HasInput("EndsTensor")) { - need_infer = true; - } - if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) { - need_infer = true; + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Step 1: Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - if (need_infer) { - if (context.HasInput("StartsTensor")) { - auto* starts_tensor = 
context.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); - } else if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - if (context.HasInput("EndsTensor")) { - auto* ends_tensor = context.Input("EndsTensor"); - ends = GetDataFromTensor(ends_tensor); - } else if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } + PADDLE_ENFORCE_EQ( starts.size(), axes.size(), platform::errors::InvalidArgument( @@ -109,175 +147,74 @@ class SliceKernel : public framework::OpKernel { ends.size(), axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); - if (input_is_tensor_array) { - auto in_array = context.Input("Input"); - // If the input is LoDTensorArray, the rank of input is 1. - int64_t in_size = in_array->size(); - int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; - int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; - - start = std::max(start, static_cast(0)); - end = std::max(end, static_cast(0)); - end = std::min(end, in_size); - - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "Attr(ends) should be greater than attr(starts) in " - "slice op. But received end = %d, start = %d.", - ends[0], starts[0])); - int64_t out_size = end - start; - - if (out_is_tensor_array) { - auto out_array = context.Output("Out"); - out_array->resize(out_size); - - for (int i = 0; i < out_size; ++i) { - auto* out_tensor = &out_array->at(i); - auto in_tensor = in_array->at(i + start); - out_tensor->set_lod(in_tensor.lod()); - if (in_tensor.memory_size() > 0) { - TensorCopy(in_tensor, context.GetPlace(), out_tensor); - } else { - VLOG(10) - << "WARNING: The input tensor 'x_tensor' holds no memory, so " - "nothing has been written to output array[" - << i << "]."; - } - } - } else { - auto out = context.Output("Out"); - auto in_tensor = in_array->at(start); - TensorCopy(in_tensor, context.GetPlace(), out); - } + // Step 2: Compute output + if (input_is_array) { + DealTensorArray(ctx, starts, ends, out_is_array); return; - } + } else { + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); - auto in = context.Input("Input"); - auto out = context.Output("Out"); + auto in_dims = in->dims(); + auto out_dims = out->dims(); + auto slice_dims = out_dims; - auto out_dims = out->dims(); - auto in_dims = in->dims(); - if (need_infer) { - out_dims = in_dims; - int64_t dim_value, start, end; + // 2.1 Infer output dims for (size_t i = 0; i < axes.size(); ++i) { - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - // when end = start+1 and start == -1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = 10000000; - } - } - - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; - start = std::max(start, static_cast(0)); - end = std::max(end, static_cast(0)); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument( - "Attr(ends) should be greater than attr(starts) in " - "slice op. 
But received end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; - } - } - out->Resize(out_dims); - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - out_dims[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; } } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - - out_dims = framework::make_ddim(new_out_shape); } - } - // resize out_dims - if (decrease_axis.size() > 0) { - if (decrease_axis.size() == (size_t)in_dims.size()) { - std::vector vec_origin_out_shape(decrease_axis.size(), 1); - out->Resize(framework::make_ddim(vec_origin_out_shape)); - } else { - std::vector vec_origin_out_shape( - out_dims.size() + decrease_axis.size(), -1); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, nullptr); + out_dims = GetDecreasedDims(slice_dims, decrease_axis); - for (size_t i = 0; i < decrease_axis.size(); ++i) { - vec_origin_out_shape[decrease_axis[i]] = 1; - } + // 2.2 Get output + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); - int index = 0; - for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { - if (vec_origin_out_shape[i] == -1) { - vec_origin_out_shape[i] = out_dims[index]; - ++index; - } - } - - out->Resize(framework::make_ddim(vec_origin_out_shape)); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = slice_dims[i]; } - } - - out->mutable_data(context.GetPlace()); - - auto new_out_dims = out->dims(); - auto offsets = Eigen::DSizes(); - auto extents = Eigen::DSizes(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = new_out_dims[i]; - } - int64_t start; - for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); + for (size_t i = 0; i < axes.size(); ++i) { + offsets[axes[i]] = starts[i]; } - start = std::max(start, static_cast(0)); - offsets[axes[i]] = start; - } - auto in_t = - framework::EigenTensor::From( - *in); - auto out_t = - framework::EigenTensor::From( - *out, new_out_dims); - if (in->numel() <= Eigen::NumTraits::highest()) { - // similar to tf.slice: - // if element number less than INT_MAX, change the type of index to int - Eigen::DSizes offsets_32bit, extents_32bit; - for (size_t i = 0; i < D; i++) { - offsets_32bit[i] = offsets[i]; - extents_32bit[i] = extents[i]; + out->Resize(slice_dims); + out->mutable_data(ctx.GetPlace()); + + auto in_t = framework::EigenTensor::From(*in, in_dims); + auto out_t = framework::EigenTensor::From(*out, slice_dims); + auto& eigen_place = + *ctx.template device_context().eigen_device(); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + EigenSlice, T, D>::Eval( + eigen_place, 
framework::To32BitIndex(out_t), + framework::To32BitIndex(in_t), offsets_32bit, extents_32bit); + } else { + EigenSlice, T, D>::Eval( + eigen_place, out_t, in_t, offsets, extents); } - EigenSlice, T, D>::Eval( - place, framework::To32BitIndex(out_t), framework::To32BitIndex(in_t), - offsets_32bit, extents_32bit); - } else { - EigenSlice, T, D>::Eval(place, out_t, in_t, - offsets, extents); - } - out->Resize(out_dims); + out->Resize(out_dims); + } } }; @@ -285,11 +222,9 @@ template class SliceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - size_t rank = is_tensor_array - ? 1 - : ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_array = input_var->IsType(); + size_t rank = is_array ? 1 : ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -310,53 +245,48 @@ class SliceGradKernel : public framework::OpKernel { case 6: SliceCompute<6>(ctx); break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); } } private: template - void SliceCompute(const framework::ExecutionContext& context) const { - auto axes = context.Attr>("axes"); - - auto starts_int = context.Attr>("starts"); + void SliceCompute(const framework::ExecutionContext& ctx) const { + auto axes = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); std::vector starts(starts_int.begin(), starts_int.end()); - - auto ends_int = context.Attr>("ends"); std::vector ends(ends_int.begin(), ends_int.end()); - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (context.HasInput("StartsTensor")) { - auto* starts_tensor = context.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (context.HasInput("EndsTensor")) { - auto* ends_tensor = context.Input("EndsTensor"); - ends = GetDataFromTensor(ends_tensor); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - framework::Variable* d_input_var = - context.OutputVar(framework::GradVarName("Input")); - const framework::Variable* d_out_var = - context.InputVar(framework::GradVarName("Out")); - bool d_input_is_tensor_array = - d_input_var->IsType(); - bool d_out_is_tensor_array = d_out_var->IsType(); - - if (d_input_is_tensor_array) { - auto* input_array = context.Input("Input"); - auto* d_input_array = context.Output( - framework::GradVarName("Input")); + + Variable* d_input_var = ctx.OutputVar(framework::GradVarName("Input")); + const Variable* d_out_var = ctx.InputVar(framework::GradVarName("Out")); + bool 
d_input_is_array = d_input_var->IsType(); + bool d_out_is_array = d_out_var->IsType(); + + if (d_input_is_array) { + auto* input_array = ctx.Input("Input"); + auto* d_in_arr = + ctx.Output(framework::GradVarName("Input")); int64_t d_in_size = input_array->size(); - d_input_array->resize(d_in_size); + d_in_arr->resize(d_in_size); // If the input is LoDTensorArray, the rank of input is 1. // So only use the 0th element of starts. int64_t start = starts[0] < 0 ? (starts[0] + d_in_size) : starts[0]; @@ -364,68 +294,61 @@ class SliceGradKernel : public framework::OpKernel { // set zero platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(context.GetPlace()); - T value = T(0); + auto& dev_ctx = *pool.Get(ctx.GetPlace()); math::SetConstant functor; for (int i = 0; i < d_in_size; ++i) { auto dim = input_array->at(i).dims(); - d_input_array->at(i).Resize(dim); - d_input_array->at(i).mutable_data(context.GetPlace()); + d_in_arr->at(i).Resize(dim); + d_in_arr->at(i).mutable_data(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), - &d_input_array->at(i), static_cast(value)); + &d_in_arr->at(i), static_cast(0)); } - if (d_out_is_tensor_array) { - auto* d_out_array = context.Input( - framework::GradVarName("Out")); - int d_out_size = d_out_array->size(); + if (d_out_is_array) { + auto* d_out_arr = + ctx.Input(framework::GradVarName("Out")); + int d_out_size = d_out_arr->size(); for (int i = 0; i < d_out_size; ++i) { - TensorCopy(d_out_array->at(i), context.GetPlace(), - &(d_input_array->at(start + i))); + TensorCopy(d_out_arr->at(i), ctx.GetPlace(), + &(d_in_arr->at(start + i))); } - } else { - auto* d_out = - context.Input(framework::GradVarName("Out")); - TensorCopy(*d_out, context.GetPlace(), &(d_input_array->at(start))); + } else { + auto* d_out = ctx.Input(framework::GradVarName("Out")); + TensorCopy(*d_out, ctx.GetPlace(), &(d_in_arr->at(start))); } return; } - auto* d_out = - context.Input(framework::GradVarName("Out")); - - auto* d_input = - context.Output(framework::GradVarName("Input")); - - d_input->mutable_data(context.GetPlace()); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_input = ctx.Output(framework::GradVarName("Input")); + d_input->mutable_data(ctx.GetPlace()); auto out_dims = d_out->dims(); auto in_dims = d_input->dims(); - auto decrease_axis = context.Attr>("decrease_axis"); - if (decrease_axis.size() > 0) { - if (decrease_axis.size() == (size_t)in_dims.size()) { + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == (size_t)in_dims.size()) { // all dims decrease - std::vector vec_origin_out_shape(decrease_axis.size(), 1); - out_dims = framework::make_ddim(vec_origin_out_shape); + std::vector origin_out_shape(decrease_size, 1); + out_dims = framework::make_ddim(origin_out_shape); } else { - std::vector vec_origin_out_shape( - out_dims.size() + decrease_axis.size(), -1); - - for (size_t i = 0; i < decrease_axis.size(); ++i) { - vec_origin_out_shape[decrease_axis[i]] = 1; + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; } int index = 0; - for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { - if (vec_origin_out_shape[i] == -1) { - vec_origin_out_shape[i] = out_dims[index]; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; 
++index; } } - out_dims = framework::make_ddim(vec_origin_out_shape); + out_dims = framework::make_ddim(origin_out_shape); } } @@ -435,28 +357,26 @@ class SliceGradKernel : public framework::OpKernel { offsets[i] = 0; extents[i] = out_dims[i]; } - int64_t start; + for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); - } + int axis = axes[i]; + int64_t start = starts[i] < 0 ? (starts[i] + in_dims[axis]) : starts[i]; start = std::max(start, static_cast(0)); - offsets[axes[i]] = start; + offsets[axis] = start; } + Eigen::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; } - EigenPaddingCompute(context, d_input, in_dims, d_out, out_dims, paddings); + EigenPaddingCompute(ctx, d_input, in_dims, d_out, out_dims, paddings); } template void EigenPaddingCompute( - const framework::ExecutionContext& context, framework::Tensor* d_input, - const framework::DDim& in_dims, const framework::Tensor* d_out, - const framework::DDim& out_dims, + const framework::ExecutionContext& context, Tensor* d_input, + const DDim& in_dims, const Tensor* d_out, const DDim& out_dims, const Eigen::array, D>& paddings) const { if (D <= 3) { // if dimension less than 3, cannot reduce dimension @@ -512,10 +432,8 @@ class SliceGradKernel : public framework::OpKernel { out_tore_shape[1] = out_dims[pad_dim]; // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: the first dimension do not need padding, // set padding[0] zero @@ -543,10 +461,8 @@ class SliceGradKernel : public framework::OpKernel { } // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: // the first dimension is the previous padding dimension @@ -579,10 +495,8 @@ class SliceGradKernel : public framework::OpKernel { } // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: // the first dimension do not need padding, set padding[0] zero @@ -606,9 +520,8 @@ class SliceGradKernel : public framework::OpKernel { template void LaunchEigenPadding( - const framework::ExecutionContext& context, framework::Tensor* d_input, - const framework::DDim& in_dims, const framework::Tensor* d_out, - const framework::DDim& out_dims, + const framework::ExecutionContext& context, Tensor* d_input, + const DDim& in_dims, const Tensor* d_out, const DDim& out_dims, const Eigen::array, D>& paddings) const { auto& place = *context.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/slice_utils.h b/paddle/fluid/operators/slice_utils.h new file mode 100644 index 00000000000000..60782a9a9248f8 --- /dev/null +++ b/paddle/fluid/operators/slice_utils.h 
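// [Editor's note] The new slice_utils.h added below centralizes the
// normalization that slice_op.cc and set_value_op.h previously duplicated. A
// hedged usage sketch, assuming only the three helpers declared in the hunk
// that follows (kept as a comment so it is not mistaken for patch content):
//
//   framework::DDim in_dims = framework::make_ddim({4, 10});
//   std::vector<int64_t> axes = {1}, starts = {-6}, ends = {100};
//   CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends);
//   // starts -> {4} (negative index wrapped), ends -> {10} (clamped to dim)
//   auto slice_dims = GetSliceDims(in_dims, axes, starts, ends);  // {4, 6}
//   std::vector<int64_t> decrease;  // empty: keep the original rank
//   auto out_dims = GetDecreasedDims(slice_dims, decrease);       // {4, 6}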
@@ -0,0 +1,143 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename T = int64_t>
+inline void CheckAndUpdateSliceAttrs(const framework::DDim in_dims,
+                                     const std::vector<T>& axes,
+                                     std::vector<T>* starts,
+                                     std::vector<T>* ends,
+                                     std::vector<T>* steps = nullptr,
+                                     std::vector<T>* infer_flags = nullptr) {
+  for (size_t i = 0; i < axes.size(); ++i) {
+    T axis = axes[i];
+    T dim_value = in_dims[axis];
+
+    if (dim_value > 0) {
+      if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+        continue;
+      }
+      T start = (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i];
+      start = std::max(start, static_cast<T>(0));
+
+      T end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i];
+      end = std::min(end, dim_value);
+
+      T step = steps == nullptr ? 1 : (*steps)[i];
+      PADDLE_ENFORCE_NE(
+          step, 0, platform::errors::InvalidArgument(
+                       "Step should not be 0, but received step = %d.", step));
+
+      if (step > 0) {
+        start = std::min(start, dim_value);
+        end = std::max(end, static_cast<T>(0));
+        PADDLE_ENFORCE_GT(
+            end, start,
+            platform::errors::InvalidArgument(
+                "When step > 0, end should be greater than start, but "
+                "received end = %d, start = %d.",
+                end, start));
+      } else {
+        // NOTE(liym27): When step < 0, start should be less than or equal to
+        // dim_value - 1. "end is -1" means the slice contains the 0-th
+        // element of this axis.
+        start = std::min(start, dim_value - 1);
+        end = std::max(end, static_cast<T>(-1));
+        PADDLE_ENFORCE_GT(
+            start, end,
+            platform::errors::InvalidArgument(
+                "When step < 0, start should be greater than end, but "
+                "received start = %d, end = %d.",
+                start, end));
+      }
+
+      (*starts)[i] = start;
+      (*ends)[i] = end;
+    }
+  }
+}
+
+template <typename T = int64_t>
+inline framework::DDim GetSliceDims(const framework::DDim in_dims,
+                                    const std::vector<T>& axes,
+                                    const std::vector<T>& starts,
+                                    const std::vector<T>& ends,
+                                    std::vector<T>* steps = nullptr,
+                                    std::vector<T>* infer_flags = nullptr) {
+  framework::DDim slice_dims(in_dims);
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    T axis = axes[i];
+    if (infer_flags != nullptr && (*infer_flags)[i] == -1) {
+      slice_dims[axis] = -1;
+      continue;
+    }
+
+    T start = starts[i];
+    T end = ends[i];
+    T step = steps == nullptr ?
1 : (*steps)[i]; + + if (step > 0) { + slice_dims[axis] = (end - start + step - 1) / step; + } else { + slice_dims[axis] = (end - start + step + 1) / step; + } + } + return slice_dims; +} + +template +inline framework::DDim GetDecreasedDims(const framework::DDim slice_dims, + const std::vector& decrease_axes, + std::vector* infer_flags = nullptr) { + framework::DDim decreased_dims(slice_dims); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + T axis = decrease_axes[i]; + if (infer_flags && (*infer_flags)[i] != -1) { + PADDLE_ENFORCE_EQ( + decreased_dims[axis], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); + } + decreased_dims[axis] = 0; + } + + std::vector new_shape; + for (int i = 0; i < decreased_dims.size(); ++i) { + if (decreased_dims[i] != 0) { + new_shape.push_back(decreased_dims[i]); + } + } + + // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and + // uses [1] instead. + if (new_shape.size() == 0) { + new_shape.push_back(1); + } + + decreased_dims = framework::make_ddim(new_shape); + } + return decreased_dims; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8fe456edeabf11..4aec4c17422792 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -15,481 +15,44 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/softmax_impl.cuh" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/for_range.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif namespace paddle { namespace operators { -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; -// Wrapper of log function. Use log(float32) for float16 -template -static __device__ __forceinline__ T Log(T x) { - using AccT = typename details::MPTypeTrait::Type; - AccT logx = std::log(static_cast(x)); - return math::TolerableValue()(static_cast(logx)); -} - -// Wrapper of exp function. Use exp(float32) for float16 +namespace { template -static __device__ __forceinline__ T Exp(T x) { - using AccT = typename details::MPTypeTrait::Type; - AccT expx = std::exp(static_cast(x)); - return math::TolerableValue()(static_cast(expx)); -} - -// log2(value) -static inline int Log2Ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} - -enum class SoftmaxMode { kSoftmax, kLogSoftmax, kCrossEntropy }; - -/* - Hard label cross entropy. 
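A worked example of the two helpers introduced in slice_utils.h above, for one axis of size 5 and the reversed slice start = -1, end = -6, step = -2 (a standalone sketch using plain integers in place of framework::DDim):

    #include <algorithm>
    #include <cstdio>

    // Sketch: normalize (start, end) like CheckAndUpdateSliceAttrs and
    // compute the sliced extent like GetSliceDims for one axis of size 5.
    int main() {
      long dim = 5, start = -1, end = -6, step = -2;   // reversed slice
      start = start < 0 ? start + dim : start;         // -1 -> 4
      start = std::min(std::max(start, 0L), dim - 1);  // clamp to [0, dim-1]
      end = end < 0 ? end + dim : end;                 // -6 -> -1
      end = std::max(end, -1L);                        // end == -1 keeps x[0]
      long size = (end - start + step + 1) / step;     // (-1-4-2+1)/-2 = 3
      std::printf("start=%ld end=%ld size=%ld\n", start, end, size);  // 4 -1 3
      return 0;  // the slice visits x[4], x[2], x[0]: three elements
    }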
-*/ -template -__global__ void CrossEntropyHardLabel(T* loss, const T* softmax, - const int64_t* labels, const int n, - const int dim, const int d, - const int ignore_idx) { - int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = ids / d; - int64_t idx_d = ids % d; - - // thread ids compute loss[ids] using softmax[idx] - if (ids < n * d) { - int64_t idx = idx_n * dim * d + labels[ids] * d + idx_d; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (labels[ids] == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -Log(softmax[idx]); - } - } else { - // IgnoreIndex is false - loss[ids] = -Log(softmax[idx]); - } - } -} - -/* - Hard label cross entropy with exp. - Input: log softmax - Output: loss and exp(input) -*/ -template -__global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, - const int64_t* labels, const int n, - const int dim, const int d, - const int ignore_idx) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = idx / (d * dim); - int64_t idx_dim = (idx / d) % dim; - int64_t idx_d = idx % d; - int64_t ids = idx_n * d + idx_d; - - if (idx < n * dim * d) { - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (idx_dim == labels[ids]) { - if (labels[ids] == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -softmax[idx]; - } - } - } else { - // IgnoreIndex is false - if (labels[ids] >= 0 && labels[ids] < dim) { - if (labels[ids] == idx_dim) { - loss[ids] = -softmax[idx]; - } - } else { - loss[ids] = static_cast(0.0); - } - } - softmax[idx] = Exp(softmax[idx]); - } -} - -/* - Core function of softmax with cross entropy forward - - softmax, SoftmaxMode=kSoftmax - - log softmax, SoftmaxMode=kLogSoftmax - - softmax with cross entropy hard label, SoftmaxMode=kCrossEntropy - The computation includes - - Compute max value: maxvalue_{i} = max_j src_{i,j} - - Compute sum of exp: s_{i} = sum_{j}{e^{src_{i,j} - maxvalue_{i}}} - - Compute: softmax_{i,j} = e^{src_{i,j} - maxvalue_{i}} / s_{i} - - Compute: logsoftmax_{i,j} = src_{i,j} - maxvalue_{i} - log(s_{i}) - - Compute: loss_{i} = -logsoftmax[i,label[i]] (Hard label) - This computation results from following formula: - softmax_{i,j} = e^{src_{i,j}} / sum_{j}{e^{src_{i,j}}} - = e^{src_{i,j} - maxvalue_{i}} - / sum_{j}{e^{src_{i,j} - maxvalue_{i}}} - = e^{src_{i,j} - maxvalue_{i}} / s_{i} - logsoftmax_{i,j} = log(softmax_{i,j}) - = src_{i,j} - maxvalue_{i} - log(s_{i}) - One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). - For reduction max (sum), firstly compute max (sum) to one warp, then use - shuffle api to compute max (sum) in one warp. -*/ -template -__global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, - const int64_t* label, const int batch_size, - const int stride, const int element_count, - const int ignore_index) { - constexpr int kDimCeil = 1 << Log2Elements; - constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; - constexpr int kVSize = sizeof(VecT) / sizeof(T); - constexpr int kIterations = kDimCeil / kWarpSize; - constexpr int kIterationsV = - (kIterations >= kVSize) ? (kIterations / kVSize) : 1; - constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; - - int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; - - // max index to read - int idx_max_v[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; i++) { - int idx_max = ((i + first_batch) < batch_size) ? 
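The identities in the comment above are easy to verify on the host. A minimal sketch of the numerically stable log-softmax and the hard-label loss (illustrative values):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Sketch of the identities above:
    //   logsoftmax_j = x_j - max - log(sum_k exp(x_k - max))
    //   loss_i = -logsoftmax[label_i]   (hard label)
    int main() {
      double x[3] = {1.0, 2.0, 3.0};
      int label = 2;
      double m = x[0];
      for (double v : x) m = std::max(m, v);        // row max
      double s = 0.0;
      for (double v : x) s += std::exp(v - m);      // shifted exp-sum
      double loss = -(x[label] - m - std::log(s));  // -logsoftmax[label]
      std::printf("loss = %.6f\n", loss);           // 0.407606
      return 0;
    }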
element_count : 0; - idx_max_v[i] = idx_max / kVSize; - } - - // read data from global memory - AccT srcdata[kBatchSize][kIterationsV][kVSize]; - -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { -// read data to srcdata: - KVSize==1, - KVSize>1 -#pragma unroll - for (int it = 0; it < kIterationsV; ++it) { - int src_idx = threadIdx.x + it * kWarpSize; - if (kVSize == 1) { - if (src_idx < idx_max_v[i]) { - srcdata[i][it][0] = - static_cast(src[(first_batch + i) * stride + src_idx]); - } else { - srcdata[i][it][0] = -std::numeric_limits::infinity(); - } - } else { - const VecT* src_v = - reinterpret_cast(&src[(first_batch + i) * stride]); - if (src_idx < idx_max_v[i]) { - VecT srctmp = src_v[src_idx]; - const T* srcinptr = reinterpret_cast(&srctmp); -#pragma unroll - for (int s = 0; s < kVSize; s++) { - srcdata[i][it][s] = static_cast(srcinptr[s]); - } - } else { -#pragma unroll - for (int s = 0; s < kVSize; s++) { - srcdata[i][it][s] = -std::numeric_limits::infinity(); - } - } - } - } - } - - // compute max value: maxvalue_{i} = max_j src_{i,j} - AccT max_value[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - // it = 0 - AccT valmax = srcdata[i][0][0]; -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; - } - max_value[i] = valmax; - -// it = 1, 2, ... -#pragma unroll - for (int it = 1; it < kIterationsV; ++it) { - AccT valmax = srcdata[i][it][0]; -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; - } - max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax; - } - } - WarpReduceMax(max_value); - - // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } - AccT sum[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - // it = 0 - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); - } else { - srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); - sum[i] = srcdata[i][0][0]; - } -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); - } else { - srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); - sum[i] += srcdata[i][0][s]; - } - } - -// it = 1, 2, ... 
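WarpReduceMax and WarpReduceSum rely on a butterfly (xor-shuffle) exchange; on CUDA the primitive is __shfl_xor_sync. The same data movement can be simulated on the host for an 8-lane warp (a sketch, not the device code):

    #include <algorithm>
    #include <cstdio>

    // Sketch: the butterfly max-reduction pattern behind WarpReduceMax,
    // simulated for an 8-lane "warp". After log2(8) = 3 rounds every lane
    // holds the maximum.
    int main() {
      float lane[8] = {3, 1, 4, 1, 5, 9, 2, 6};
      for (int offset = 4; offset > 0; offset >>= 1) {
        float shuffled[8];
        for (int i = 0; i < 8; ++i) shuffled[i] = lane[i ^ offset];  // xor shuffle
        for (int i = 0; i < 8; ++i) lane[i] = std::max(lane[i], shuffled[i]);
      }
      std::printf("max=%g\n", lane[0]);  // every lane now holds 9
      return 0;
    }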
-#pragma unroll - for (int it = 1; it < kIterationsV; ++it) { -#pragma unroll - for (int s = 0; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); - } else { - srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); - sum[i] += srcdata[i][it][s]; - } - } - } - } - WarpReduceSum(sum); - -// write data -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] = std::log(sum[i]); - } - -#pragma unroll - for (int it = 0; it < kIterationsV; ++it) { - int idx = threadIdx.x + it * kWarpSize; - if (kVSize == 1) { // kVSize==1 - if (idx < idx_max_v[i]) { - if (mode == SoftmaxMode::kLogSoftmax) { // log softmax - softmax[(first_batch + i) * stride + idx] = - srcdata[i][it][0] - max_value[i] - sum[i]; - // softmax with cross entropy hard label - } else if (mode == SoftmaxMode::kCrossEntropy) { - AccT logsoftmax = srcdata[i][it][0] - max_value[i] - sum[i]; - // softmax - softmax[(first_batch + i) * stride + idx] = std::exp(logsoftmax); - // label - int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (label[first_batch + i] == loss_idx) { - if (label[first_batch + i] != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { - // IgnoreIndex is false - if (label[first_batch + i] >= 0 && - label[first_batch + i] < element_count) { - if (label[first_batch + i] == loss_idx) { - loss[first_batch + i] = -logsoftmax; - } - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { // softmax - softmax[(first_batch + i) * stride + idx] = - srcdata[i][it][0] / sum[i]; - } - } else { - break; - } - } else { // KVSize>1 - VecT* softmax_v = - reinterpret_cast(&softmax[(first_batch + i) * stride]); - VecT tmpdata; - T* tmpptr = reinterpret_cast(&tmpdata); -#pragma unroll - for (int s = 0; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax) { // log softmax - tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; - // softmax with cross entropy hard label - } else if (mode == SoftmaxMode::kCrossEntropy) { - AccT logsoftmax = srcdata[i][it][s] - max_value[i] - sum[i]; - // softmax - tmpptr[s] = std::exp(logsoftmax); - // label - int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (label[first_batch + i] == loss_idx && - label[first_batch + i] != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } - } else { - // IgnoreIndex is false - if (label[first_batch + i] >= 0 && - label[first_batch + i] < element_count) { - if (label[first_batch + i] == loss_idx) { - loss[first_batch + i] = -logsoftmax; - } - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { // softmax - tmpptr[s] = srcdata[i][it][s] / sum[i]; - } - } - if (idx < idx_max_v[i]) { - softmax_v[idx] = tmpdata; - } else { - break; - } - } +__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, + const int64_t n, const int64_t d, + const int64_t remain, const int ignore_index) { + CUDA_KERNEL_LOOP_TYPE(index, n * remain, int64_t) { + int64_t idx_n = index / remain; + int64_t idx_remain = index % remain; + int64_t tmp = labels[index]; + if (ignore_index != tmp) { + int64_t idx = idx_n * d + tmp * remain + idx_remain; + logit_grad[idx] -= static_cast(1.); } } } -#define 
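Taken together, the new CrossEntropyGrad and Scale kernels compute d(logits) = (softmax - one_hot(label)) * d(loss), with rows whose label equals ignore_index zeroed. A host-side sketch of the two steps (illustrative values):

    #include <cstdio>

    // Sketch: the two-step backward. CrossEntropyGrad subtracts 1 at the
    // label position (softmax was stored into logit_grad beforehand), then
    // Scale multiplies by the upstream loss gradient and zeroes rows whose
    // label equals ignore_index.
    int main() {
      double logit_grad[3] = {0.090031, 0.244728, 0.665241};  // softmax([1,2,3])
      int label = 2;
      double loss_grad = 1.0;
      logit_grad[label] -= 1.0;                     // CrossEntropyGrad
      for (double& g : logit_grad) g *= loss_grad;  // Scale
      std::printf("%.6f %.6f %.6f\n", logit_grad[0], logit_grad[1],
                  logit_grad[2]);  // 0.090031 0.244728 -0.334759
      return 0;
    }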
SOFTMAX_WARP_FORWARD_CASE(Log2Elements, VecT, AccT) \ - case Log2Elements: \ - WarpSoftmaxForward<<>>( \ - loss, softmax, src, label, batch_size, stride, element_count, \ - ignore_index); \ - break; - -/* - Wrapper of softmax with cross entropy forward hard label. -*/ -template -void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, - const int64_t* label, const int batch_size, - const int stride, const int element_count, - const int ignore_index, gpuStream_t stream) { - using AccT = typename details::MPTypeTrait::Type; - - // use 128 threads per block to maximimize gpu utilization - const int Log2Elements = static_cast(Log2Ceil(element_count)); - const int kDimCeil = 1 << Log2Elements; - int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; - int batches_per_warp = (kDimCeil <= 128) ? 2 : 1; - constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); - int batches_per_block = warps_per_block * batches_per_warp; - int blocks = (batch_size + batches_per_block - 1) / batches_per_block; - dim3 threads(kWarpSize, warps_per_block, 1); - - switch (Log2Elements) { - SOFTMAX_WARP_FORWARD_CASE(0, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(1, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(2, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(3, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(4, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(5, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(6, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(7, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(8, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(9, T, AccT); - default: - break; - } -} - -/* - Wrapper of softmax with cross entropy hard label. - - SwitchWarpSoftmaxForward for small size - - cudnn function for large size -*/ -template -static void SoftmaxWithCrossEntropyHardLabel( - const platform::CUDADeviceContext& ctx, int rank, int axis, - const T* logits_data, const int64_t* labels_data, T* loss_data, - T* softmax_data, int N, int dim, int D, const int ignore_index) { - auto stream = ctx.stream(); - constexpr int max_dim = 320; - if (D == 1 && dim <= max_dim) { // small size - const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, dim, - ignore_index, stream); - } else { - ScopedTensorDescriptor desc; - std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); -#else - cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); -#endif - - auto handle = ctx.cudnn_handle(); - -#ifdef PADDLE_WITH_HIP - auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE - : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), descp, logits_data, - platform::CudnnDataType::kZero(), descp, softmax_data, - MIOPEN_SOFTMAX_LOG, mode)); -#else - auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE - : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - descp, logits_data, platform::CudnnDataType::kZero(), descp, - softmax_data)); -#endif - int threads = 128; - int blocks = (N * dim * D + threads - 1) / threads; - // compute cross entropy, input is log softmax - CrossEntropyExpHardLabel<<>>( - loss_data, softmax_data, labels_data, N, dim, D, ignore_index); - } -} - -/* - Wrapper of softmax with cross entropy grad hard label. 
-*/ template -__global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const int64_t* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = idx / (d * dim); - int64_t idx_dim = (idx / d) % dim; - int64_t idx_d = idx % d; - int64_t ids = idx_n * d + idx_d; - - if (idx < n * dim * d) { - if (labels[ids] == ignore_index) { - logits_grad[idx] = static_cast(0.0); - } else if (labels[ids] == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; +__global__ void Scale(T* logit_grad, const T* loss_grad, const int64_t num, + const int64_t d, const int64_t remain, + const int64_t* labels, const int ignore_index) { + CUDA_KERNEL_LOOP_TYPE(index, num, int64_t) { + int64_t idx_n = index / d; + int64_t idx_remain = index % remain; + int64_t idx_lbl = idx_n * remain + idx_remain; + if (labels[idx_lbl] == ignore_index) { + logit_grad[index] = static_cast(0.); } else { - logits_grad[idx] *= loss_grad[ids]; + logit_grad[index] *= loss_grad[idx_lbl]; } } } @@ -560,6 +123,8 @@ __global__ void ScaleCrossEntropyGradient(T* logit_grad, const T* loss_grad, } } +} // namespace + static __device__ __forceinline__ platform::float16 exp_on_device( platform::float16 x) { return ::Eigen::numext::exp(x); @@ -831,6 +396,278 @@ static __global__ void RowReductionForCrossEntropy(const T* logits_data, if (threadIdx.x == 0) loss_data[blockIdx.x] = loss; } +template +struct HardLabelCrossEntropyFunctor { + public: + HardLabelCrossEntropyFunctor(const int64_t* labels, T* loss, + const T* logits_data, int d, int axis_dim) + : labels_(labels), + loss_(loss), + logits_data_(logits_data), + d_(d), + axis_dim_(axis_dim) {} + + __device__ void operator()(int idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int remain = d_ / axis_dim_; + int idx_n = idx / d_; + int idx_axis = (idx % d_) / remain; + int idx_remain = idx % remain; + // labels, loss view as [n, remain] + int idx_lbl = idx_n * remain + idx_remain; + // It also would ignore labels not in range(class_num). 
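All of these functors decompose a flat index over the logits viewed as [n, axis_dim, remain], where d = axis_dim * remain. A standalone sketch of the arithmetic:

    #include <cstdio>

    // Sketch: decompose a flat index over a logits view [n, axis_dim, remain]
    // exactly as in HardLabelCrossEntropyFunctor (here axis_dim = 4,
    // remain = 3, so d = 12 and n = 2).
    int main() {
      const int axis_dim = 4, remain = 3, d = axis_dim * remain;
      for (int idx : {0, 7, 23}) {
        int idx_n = idx / d;                        // which sample
        int idx_axis = (idx % d) / remain;          // class along the soft axis
        int idx_remain = idx % remain;              // position in trailing dims
        int idx_lbl = idx_n * remain + idx_remain;  // index into labels/loss
        std::printf("idx=%2d -> n=%d axis=%d remain=%d lbl=%d\n", idx, idx_n,
                    idx_axis, idx_remain, idx_lbl);
      }
      return 0;
    }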
+ if (idx_axis != labels_[idx_lbl]) { + } else { + loss_[idx_lbl] = -log_on_device(logits_data_[idx]); + } + } + + private: + const int64_t* labels_; + T* loss_; + const T* logits_data_; + int d_; + int axis_dim_; +}; + +template +struct HardLabelCrossEntropyFunctorWithIgnoreIdx { + public: + HardLabelCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, T* loss, + const T* logits_data, int d, + int axis_dim, int ignore_idx) + : labels_(labels), + loss_(loss), + logits_data_(logits_data), + d_(d), + axis_dim_(axis_dim), + ignore_idx_(ignore_idx) {} + + __device__ void operator()(int idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int remain = d_ / axis_dim_; + int idx_n = idx / d_; + int idx_axis = (idx % d_) / remain; + int idx_remain = idx % remain; + // labels, loss view as [n, remain] + int idx_lbl = idx_n * remain + idx_remain; + + if (idx_axis == labels_[idx_lbl] && idx_axis != ignore_idx_) { + loss_[idx_lbl] = -log_on_device(logits_data_[idx]); + } + } + + private: + const int64_t* labels_; + T* loss_; + const T* logits_data_; + int d_; + int axis_dim_; + int ignore_idx_; +}; + +template +static void HardLabelCrossEntropy(const platform::CUDADeviceContext& ctx, + const T* logits_data, + const int64_t* labels_data, T* loss_data, + int n, int d, int axis_dim, int ignore_idx) { + constexpr int kMaxBlockDim = 512; + int block_dim = axis_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(axis_dim))); + int grid_dim = n * d / axis_dim; + auto stream = ctx.stream(); + +#define CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + platform::ForRange for_range(ctx, n* d); \ + if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ + for_range(HardLabelCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, logits_data, d, axis_dim, ignore_idx)); \ + } else { \ + for_range(HardLabelCrossEntropyFunctor(labels_data, loss_data, \ + logits_data, d, axis_dim)); \ + } \ + } break + + switch (block_dim) { + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(512); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(256); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(128); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(64); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(32); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(16); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(8); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(4); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(2); + default: + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); + break; + } +#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL +} + +template +struct HardLabelSoftmaxWithCrossEntropyFunctor { + public: + HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss, + T* log_softmax, int64_t d, + int axis_dim, int ignore_idx) + : labels_(labels), + loss_(loss), + log_softmax_(log_softmax), + d_(d), + axis_dim_(axis_dim), + ignore_idx_(ignore_idx) {} + + __device__ void operator()(int64_t idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int64_t remain = d_ / axis_dim_; + int64_t idx_n = idx / d_; + int64_t idx_axis = (idx % d_) / remain; + int64_t idx_remain = idx % remain; + // labels, loss view as [n, remain] + int64_t idx_lbl = idx_n * remain + idx_remain; + PADDLE_ENFORCE(labels_[idx_lbl] >= 0 && labels_[idx_lbl] < d_ || + labels_[idx_lbl] == ignore_idx_, + "The value of label[%ld] expected >= 0 and < %ld, or == %d," + "but got %ld. 
Please check input value.", + idx_lbl, d_, ignore_idx_, labels_[idx_lbl]); + // It also would ignore labels not in range(class_num). + if (idx_axis != labels_[idx_lbl]) { + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); + } else { + auto softmax = log_softmax_[idx]; + log_softmax_[idx] = exp_on_device(softmax); + loss_[idx_lbl] = -softmax; + } + } + + private: + const int64_t* labels_; + T* loss_; + T* log_softmax_; + int64_t d_; + int axis_dim_; + int ignore_idx_; +}; + +template +struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { + public: + HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, + T* loss, T* log_softmax, + int64_t d, int axis_dim, + int ignore_idx) + : labels_(labels), + loss_(loss), + log_softmax_(log_softmax), + d_(d), + axis_dim_(axis_dim), + ignore_idx_(ignore_idx) {} + + __device__ void operator()(int64_t idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int64_t remain = d_ / axis_dim_; + int64_t idx_n = idx / d_; + int64_t idx_axis = (idx % d_) / remain; + int64_t idx_remain = idx % remain; + // labels, loss view as [n, remain] + int64_t idx_lbl = idx_n * remain + idx_remain; + if (idx_axis != labels_[idx_lbl] || idx_axis == ignore_idx_) { + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); + } else { + auto softmax = log_softmax_[idx]; + log_softmax_[idx] = exp_on_device(softmax); + loss_[idx_lbl] = -softmax; + } + } + + private: + const int64_t* labels_; + T* loss_; + T* log_softmax_; + int64_t d_; + int axis_dim_; + int ignore_idx_; +}; + +template +static void HardLabelSoftmaxWithCrossEntropy( + const platform::CUDADeviceContext& ctx, const T* logits_data, + const int64_t* labels_data, T* loss_data, T* softmax_data, int64_t n, + int64_t d, int axis_dim, int ignore_idx) { +#ifdef __HIPCC__ + // HIP platform will have loss nan if dim size > 256 + constexpr int kMaxBlockDim = 256; +#else + constexpr int kMaxBlockDim = 512; +#endif + int64_t block_dim = axis_dim >= kMaxBlockDim + ? 
kMaxBlockDim + : (1 << static_cast(std::log2(axis_dim))); + int64_t grid_dim = n * d / axis_dim; + auto stream = ctx.stream(); + +#ifdef __HIPCC__ +#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForMax), \ + dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ + loss_data, d, axis_dim); \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForSum), \ + dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ + loss_data, softmax_data, d, axis_dim); \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForDiff), \ + dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ + loss_data, softmax_data, d, axis_dim); \ + platform::ForRange for_range(ctx, n* d); \ + if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } else { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } \ + } break +#else +#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + RowReductionForMax<<>>( \ + logits_data, loss_data, d, axis_dim); \ + RowReductionForDiffMaxSum<<>>( \ + logits_data, loss_data, softmax_data, d, axis_dim); \ + platform::ForRange for_range(ctx, n* d); \ + if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } else { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } \ + } break +#endif + + switch (block_dim) { + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); + default: + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); + break; + } +#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL +} + template static void SoftmaxWithCrossEntropyFusedKernel( const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, @@ -946,7 +783,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { const int rank = softmax->dims().size(); const int axis = CanonicalAxis(context.Attr("axis"), rank); - const int axis_dim = softmax->dims()[axis]; + int axis_dim = softmax->dims()[axis]; const int n = SizeToAxis(axis, softmax->dims()); const int d = SizeFromAxis(axis, softmax->dims()); @@ -989,19 +826,9 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { // HardLabel auto* logits_data = softmax->data(); auto* labels_data = labels->data(); - int threads = 128; - int blocks = (n * d / axis_dim + threads - 1) / threads; - if (ignore_index >= 0 && ignore_index < axis_dim) { - CrossEntropyHardLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, logits_data, labels_data, n, axis_dim, d / 
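Both dispatchers select the largest power of two not exceeding axis_dim as the block size, capped at kMaxBlockDim (512 under CUDA, 256 under HIP, where larger reductions reportedly produce NaNs). A host-side sketch of the selection rule:

    #include <cmath>
    #include <cstdio>

    // Sketch: block size = largest power of two <= axis_dim, capped at
    // kMaxBlockDim, as in the switch-based dispatchers above.
    int main() {
      const int kMaxBlockDim = 512;
      for (int axis_dim : {2, 5, 100, 1000}) {
        int block_dim = axis_dim >= kMaxBlockDim
                            ? kMaxBlockDim
                            : (1 << static_cast<int>(std::log2(axis_dim)));
        std::printf("axis_dim=%4d -> block_dim=%d\n", axis_dim, block_dim);
      }
      return 0;  // prints 2, 4, 64, 512
    }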
axis_dim, - ignore_index); - } else { - CrossEntropyHardLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, - ignore_index); - } + HardLabelCrossEntropy(context.cuda_device_context(), logits_data, + labels_data, loss_data, n, d, axis_dim, + ignore_index); } // cause of input is softmax @@ -1059,17 +886,9 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { auto* logits_data = logits->data(); auto* labels_data = labels->data(); - if (ignore_index >= 0 && ignore_index < axis_dim) { - SoftmaxWithCrossEntropyHardLabel( - context.cuda_device_context(), rank, axis, logits_data, - labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, - ignore_index); - } else { - SoftmaxWithCrossEntropyHardLabel( - context.cuda_device_context(), rank, axis, logits_data, - labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, - ignore_index); - } + HardLabelSoftmaxWithCrossEntropy( + context.cuda_device_context(), logits_data, labels_data, loss_data, + softmax_data, n, d, axis_dim, ignore_index); } } } @@ -1140,11 +959,14 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + int64_t grid = (n * remain + block - 1) / block; const int64_t* label_data = labels->data(); - int grid = (n * d + block - 1) / block; - SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + CrossEntropyGrad<<>>( + logit_grad_data, label_data, n, d, remain, ignore_index); + int64_t num = n * d; + grid = (num + block - 1) / block; + Scale<<>>(logit_grad_data, loss_grad_data, num, + d, remain, label_data, ignore_index); } } }; diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index d71be60e1f5c22..f8272d550b9991 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -324,6 +324,7 @@ REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad, REGISTER_OP_CPU_KERNEL( strided_slice, + ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, @@ -335,6 +336,7 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( strided_slice_grad, + ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu index 68a8312f0818d4..f88605fbfc86dc 100644 --- a/paddle/fluid/operators/strided_slice_op.cu +++ b/paddle/fluid/operators/strided_slice_op.cu @@ -18,6 +18,7 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( strided_slice, + ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, @@ -29,7 +30,8 @@ REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL( strided_slice_grad, - ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index a7d7ea260ecdf4..07749f90ebaa29 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -22,6 +22,7 @@ limitations under the License. */ #ifdef __HIPCC__ #include #endif +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" @@ -563,15 +564,19 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, const Eigen::DSizes slice_sizes{num_rows, k}; auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From(temp_indices); + auto e_tmp_indices = framework::EigenMatrix::From( + static_cast(temp_indices)); std::vector odims = {static_cast(num_rows), static_cast(k)}; auto dim = framework::make_ddim(odims); auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From(temp_values); + auto e_tmp_values = + framework::EigenMatrix::From(static_cast(temp_values)); - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); } return true; } diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 8fb0b3809503ec..3e943c62e1ce17 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, + tril_triu, ops::TrilTriuOpKernel, + ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel); REGISTER_OP_CPU_KERNEL( tril_triu_grad, + ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu index d04acd34059792..9cbbdeeb2ce284 100644 --- a/paddle/fluid/operators/tril_triu_op.cu +++ b/paddle/fluid/operators/tril_triu_op.cu @@ -18,7 +18,7 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - tril_triu, + tril_triu, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, @@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL( ops::TrilTriuOpKernel); REGISTER_OP_CUDA_KERNEL( tril_triu_grad, + ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, diff --git a/paddle/fluid/platform/complex128.h b/paddle/fluid/platform/complex128.h deleted file mode 100644 index 
da2f83c3497cce..00000000000000 --- a/paddle/fluid/platform/complex128.h +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX128 -#endif - -namespace paddle { -namespace platform { - -struct PADDLE_ALIGN(16) complex128 { - public: - double real; - double imag; - - complex128() = default; - complex128(const complex128& o) = default; - complex128& operator=(const complex128& o) = default; - complex128(complex128&& o) = default; - complex128& operator=(complex128&& o) = default; - ~complex128() = default; - - HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - HOSTDEVICE inline explicit complex128(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipDoubleComplex() const { - return make_hipDoubleComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuDoubleComplex() const { - return make_cuDoubleComplex(real, imag); - } -#endif -#endif - - HOSTDEVICE complex128(const float& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex128(const double& val) : real(val), imag(0) {} - HOSTDEVICE complex128(const int& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex128(const int64_t& val) - : real(static_cast(val)), imag(0) {} - - HOSTDEVICE inline explicit operator std::complex() { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE inline explicit complex128(const T& val) - : real(complex128(static_cast(val)).real) {} - - HOSTDEVICE complex128(const std::complex val) - : real(val.real()), imag(val.imag()) {} - - HOSTDEVICE inline complex128& operator=(bool b) { - real = b ? 
1 : 0; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(float val) { - real = val; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(double val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline operator float() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } -}; - -HOSTDEVICE inline complex128 operator+(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) + - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real + b.real, a.imag + b.imag); -#endif -} - -HOSTDEVICE inline complex128 operator-(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) - - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real - b.real, a.imag - b.imag); -#endif -} - -HOSTDEVICE inline complex128 operator*(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) * - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -HOSTDEVICE inline complex128 operator/(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || 
defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) / - thrust::complex(b.real, b.imag)); -#else - double denominator = b.real * b.real + b.imag * b.imag; - return complex128((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -HOSTDEVICE inline complex128 operator-(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(-thrust::complex(a.real, a.imag)); -#else - complex128 res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; -#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - double denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - -HOSTDEVICE inline complex128 raw_uint16_to_complex128(uint16_t a) { - complex128 res; - res.real = a; - return res; -} - -HOSTDEVICE inline bool operator==(const complex128& a, const complex128& b) { - return a.real == b.real && a.imag == b.imag; -} - -HOSTDEVICE inline bool operator!=(const complex128& a, const complex128& b) { - return a.real != b.real || a.imag != b.imag; -} - -HOSTDEVICE inline bool operator<(const complex128& a, const complex128& b) { - return static_cast(a.real) < static_cast(b.real); -} - -HOSTDEVICE inline bool operator<=(const complex128& a, const complex128& b) { - return static_cast(a.real) <= static_cast(b.real); -} - -HOSTDEVICE inline bool operator>(const complex128& a, const complex128& b) { - return static_cast(a.real) > static_cast(b.real); -} - -HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) { - return static_cast(a.real) >= static_cast(b.real); -} - -HOSTDEVICE inline bool(isnan)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isnanf not supported on HIP platform - return __isnan(a.real) || __isnan(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -HOSTDEVICE inline bool(isinf)(const complex128& a) { -#if 
defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isinf not supported on HIP platform - return __isinf(a.real) || __isinf(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -HOSTDEVICE inline bool(isfinite)(const complex128& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline double(abs)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::abs(thrust::complex(a.real, a.imag)); -#else - return std::abs(std::complex(a.real, a.imag)); -#endif -} - -HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::pow(thrust::complex(a.real, a.imag), - thrust::complex(b.real, b.imag))); -#else - return std::pow(std::complex(a), std::complex(b)); -#endif -} - -HOSTDEVICE inline complex128(sqrt)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::sqrt(thrust::complex(a.real, a.imag))); -#else - return std::sqrt(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex128(tanh)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::tanh(thrust::complex(a.real, a.imag))); -#else - return std::tanh(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex128(log)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::log(thrust::complex(a.real, a.imag))); -#else - return complex128(std::log(std::complex(a))); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const complex128& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} - -} // namespace platform -} // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> { -}; -template <> -struct is_signed { - static const bool value = false; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::complex128& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::complex128& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static const bool traps = 
false; - static const bool tinyness_before = false; - - static paddle::platform::complex128(min)() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 lowest() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128(max)() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 epsilon() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 round_error() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 infinity() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 quiet_NaN() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 signaling_NaN() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 denorm_min() { - return paddle::platform::complex128(0.0, 0.0); - } -}; - -} // namespace std - -#define MKL_Complex16 paddle::platform::complex128 diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 4095720f71eb71..352143302388a9 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -31,6 +31,7 @@ namespace platform { #endif inline static int RoundToPowerOfTwo(int dim) { +#ifdef PADDLE_WITH_CUDA if (dim > 512) { return 1024; } else if (dim > 256) { @@ -44,6 +45,17 @@ inline static int RoundToPowerOfTwo(int dim) { } else { return 32; } +#else // HIP results in error or nan if > 256 + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +#endif } #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) 
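RoundToPowerOfTwo rounds dim up to the next power of two, and the new HIP branch caps the result at 256 rather than 1024 because, per the comment above, larger values produce errors or NaNs there. A sketch of both mappings (RoundUp is a hypothetical stand-in for the real function):

    #include <cstdio>

    // Sketch: round dim up to the next power of two, floored at 32 and
    // capped at 1024 on CUDA but at 256 on HIP.
    int RoundUp(int dim, int cap) {
      int v = 32;
      while (v < dim && v < cap) v <<= 1;
      return v;
    }

    int main() {
      for (int dim : {20, 100, 300, 600}) {
        std::printf("dim=%3d -> cuda=%4d hip=%3d\n", dim, RoundUp(dim, 1024),
                    RoundUp(dim, 256));
      }
      return 0;  // cuda: 32 128 512 1024; hip: 32 128 256 256
    }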
\ diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 05a431e731e32c..8e969588afbbcf 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType( case framework::proto::VarType::FP64: type = CUDNN_DATA_DOUBLE; break; +#if CUDNN_VERSION_MIN(8, 1, 0) + case framework::proto::VarType::BF16: + type = CUDNN_DATA_BFLOAT16; + break; +#endif default: break; } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 0d2a770ad8276b..65dd69a37d37f8 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -102,6 +102,25 @@ inline ActivationMode StringToActivationMode(const std::string& str) { template class CudnnDataType; +// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1 +#if CUDNN_VERSION_MIN(8, 1, 0) +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; +#endif + template <> class CudnnDataType { public: diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7e983eb54ae2cd..1179677fd6b9f5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -563,7 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); - p_exec_items_.reset(new ExecMap()); + p_exec_items_.reset(new ExecShape()); p_mutex_.reset(new std::mutex()); } @@ -644,10 +644,15 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { if (ptr == nullptr) { p_blobmap_->clear(); } else { - for (auto& v : (*p_exec_items_)[ptr]) { - (v.first)->erase(v.second); + // Iterate through all shapes and release + // for each shape and active executor all entries + // of this executor + for (auto& s : *p_exec_items_) { + for (auto& v : (*s.second)[ptr]) { + (v.first)->erase(v.second); + } + s.second->erase(ptr); } - p_exec_items_->erase(ptr); } } else { VLOG(3) << "Prevented Clearing DNNL cache."; @@ -655,11 +660,24 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { } } +void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { + p_exec_items_->erase(p_exec_items_->begin()); +} + void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, KeyBlob::iterator it) const { + // Take current input shape from TLS // Take current executor addess from TLS // and for this executor's items add the one defined with arguments - (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); + auto key_it = p_exec_items_ + ->insert(std::make_pair(tls().cur_input_shape_str, + std::make_shared())) + .first; + (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); + + VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() + << " curr exec size: " + << (*key_it->second)[tls().get_curr_exec()].size() << "\n"; } void MKLDNNDeviceContext::BlockNextCacheClearing() { @@ -716,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, VLOG(2) << "sid=" << sid << ", remove all blobs of shape: " << sBlob->begin()->first; 
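LinkEntryWithExecutor keys its bookkeeping first by the current input-shape string and then by the executor address taken from TLS. A simplified standalone sketch of that two-level map (the value types are stand-ins, not the real BlobPtr_t entries):

    #include <cstdio>
    #include <memory>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Sketch: two-level (shape -> executor -> cached entries) bookkeeping,
    // mirroring what LinkEntryWithExecutor maintains.
    using ExecKey = void*;
    using ExecMap = std::unordered_map<ExecKey, std::vector<int>>;
    using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

    int main() {
      ExecShape shapes;
      int executor_tag = 0;
      ExecKey exec = &executor_tag;  // address of the running executor
      // insert() is a no-op when the shape key exists; .first is the slot
      auto it = shapes.insert({"2x3x224x224", std::make_shared<ExecMap>()}).first;
      (*it->second)[exec].push_back(42);  // record one cached-entry handle
      std::printf("shapes=%zu entries=%zu\n", shapes.size(),
                  (*it->second)[exec].size());  // shapes=1 entries=1
      return 0;
    }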
sBlob->erase(sBlob->begin()->first);
+        RemoveShapeEntriesWithExecutor();
       }
       pBlob = std::make_shared<KeyBlob>();
       (*sBlob)[tls().cur_input_shape_str] = pBlob;
@@ -739,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name,
   return;
 }

-unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) {
+unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const {
   unsigned int num_entries = 0;
   for (auto const& l3 : *p_blobmap_) {
     for (auto const& l2 : *(l3.second)) {
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e62f0673e97fad..e2dbc90b5d1444 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -358,15 +358,16 @@ class CUDAContext {
       PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
           &miopen_major, &miopen_minor, &miopen_patch));
       auto local_miopen_version =
-          (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
-      auto compile_miopen_version = MIOPEN_VERSION / 100;
+          (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
+      auto compile_miopen_version = MIOPEN_VERSION / 10;
       if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
         LOG_FIRST_N(WARNING, 1)
             << "WARNING: device: " << place_.device
             << ". The installed Paddle is compiled with MIOPEN "
-            << compile_miopen_version / 10 << "." << compile_miopen_version % 10
+            << compile_miopen_version / 100 << "."
+            << compile_miopen_version % 100
             << ", but MIOPEN version in your machine is "
-            << local_miopen_version / 10 << "." << local_miopen_version % 10
+            << local_miopen_version / 100 << "." << local_miopen_version % 100
             << ", which may cause serious incompatible bug. "
             << "Please recompile or reinstall Paddle with compatible MIOPEN "
                "version.";
@@ -748,8 +749,14 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   using ShapeBlob = umap_key_string_t<KeyBlob>;
   using BlobMap = umap_value_smart_t<int, ShapeBlob>;

-  using ExecMap = std::unordered_map<
-      void*, std::vector<std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>>>;
+  // Auxiliary two-level structure (shape, executor) to make it easier to
+  // clear cache objects related to a specific executor
+
+  using ExecKey = void*;
+  using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>;
+  using ExecMap =
+      std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>;
+  using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>;

   explicit MKLDNNDeviceContext(CPUPlace place);

@@ -758,6 +765,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   // Register object to currently used executor's map
   void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const;
+  void RemoveShapeEntriesWithExecutor(void) const;

   // Remove all entries from the blob map
   void ResetBlobMap(void* ptr);
@@ -772,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   void SetBlob(const std::string& name, std::shared_ptr<void> data) const;

   // Calculate number of oneDNN objects cached
-  unsigned int GetCachedObjectsNumber(void);
+  unsigned int GetCachedObjectsNumber(void) const;

   // Find a saved blob.
Return nullptr if not found std::shared_ptr GetBlob(const std::string& name) const; @@ -785,7 +793,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { std::shared_ptr p_blobmap_; // Map key is pointer of executor and value is a data(iterator in map) needed // to erase - std::shared_ptr p_exec_items_; + std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; }; diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index f8e031104415e8..185646e7327006 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -26,9 +26,11 @@ size_t Alignment(size_t size, const platform::Place &place) { #elif defined(PADDLE_WITH_XPU) // TODO(wangxi): add XpuMinChunkSize alignment = alignment; +#elif defined(PADDLE_WITH_ASCEND_CL) + alignment = NPUMinChunkSize(); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA.")); + "Fluid is not compiled with CUDA or NPU.")); #endif } size_t remaining = size % alignment; diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index a151e434833587..e0f2f0f11c9c3f 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -19,6 +19,8 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/gpu_info.h" +#elif defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/npu_info.h" #endif namespace paddle { diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 77ff3f3ccbbb6e..f72eb6731f6276 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" -#define MIOPEN_VERSION \ - (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 100 + \ +#define MIOPEN_VERSION \ + (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ MIOPEN_VERSION_PATCH) // NOLINT namespace paddle { diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 49bd57f0406d63..09b8c8137fcd1f 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -16,7 +16,6 @@ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" @@ -24,7 +23,6 @@ namespace Eigen { -using complex128 = paddle::platform::complex128; using float16 = paddle::platform::float16; template using complex = paddle::platform::complex; @@ -62,28 +60,6 @@ struct NumTraits } }; -template <> -struct NumTraits : GenericNumTraits> { - typedef double Real; - typedef typename NumTraits::Literal Literal; - enum { - IsComplex = 1, - RequireInitialization = NumTraits::RequireInitialization, - ReadCost = 2 * NumTraits::ReadCost, - AddCost = 2 * NumTraits::AddCost, - MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost - }; - - EIGEN_DEVICE_FUNC - static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC - static inline Real dummy_precision() { - return NumTraits::dummy_precision(); - } - EIGEN_DEVICE_FUNC - static inline int digits10() { return NumTraits::digits10(); } -}; - template <> struct NumTraits> : GenericNumTraits> { typedef float Real; @@ -247,71 +223,6 @@ HOSTDEVICE inline paddle::platform::bfloat16 maxi( return a < b ? 
b : a; } -//////////// complex128 methods ///////////// - -template <> -HOSTDEVICE inline bool(isnan)(const complex128& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const complex128& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const complex128& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline complex128 exp(const complex128& a) { - double com = ::expf(a.real); - double res_real = com * ::cosf(a.imag); - double res_imag = com * ::sinf(a.imag); - return complex128(res_real, res_imag); -} - -template <> -HOSTDEVICE inline complex128 log(const complex128& a) { - return paddle::platform::log(a); -} - -template <> -HOSTDEVICE inline complex128 tanh(const complex128& a) { - return paddle::platform::tanh(a); -} - -template <> -HOSTDEVICE inline complex128 sqrt(const complex128& a) { - return paddle::platform::sqrt(a); -} - -template <> -HOSTDEVICE inline complex128 ceil(const complex128& a) { - return complex128(::ceilf(a.real), ::ceilf(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 floor(const complex128& a) { - return complex128(::floorf(a.real), ::floor(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 round(const complex128& a) { - return complex128(::roundf(a.real), ::roundf(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 pow(const complex128& a, const complex128& b) { - return paddle::platform::pow(a, b); -} - -template <> -HOSTDEVICE inline double abs(const complex128& a) { - return paddle::platform::abs(a); -} - //////////// complex methods ///////////// template <> diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d3890de89a5d14..c63ea3fa8573b8 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -775,13 +775,13 @@ inline std::string GetExternalErrorMsg(T status) { } } #else - char buf[100]; + char buf[512]; MEMORY_BASIC_INFORMATION mbi; HMODULE h_module = (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) ? (HMODULE)mbi.AllocationBase : NULL; - GetModuleFileName(h_module, buf, 100); + GetModuleFileName(h_module, buf, 512); std::string strModule(buf); const size_t last_slash_idx = strModule.find_last_of("\\"); std::string compare_path = strModule.substr(strModule.length() - 7); diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 842d4cc139281a..95a852ad6e92a3 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -417,7 +417,7 @@ TEST(enforce, cuda_success) { "An unsupported value or parameter was passed to the function (a " "negative vector size, for example).To correct: ensure that all the " "parameters being passed have valid values")); - /* + #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); @@ -430,7 +430,6 @@ TEST(enforce, cuda_success) { "The call to NCCL is incorrect. 
This is " "usually reflecting a programming error")); #endif -*/ } #endif #endif diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h index 6c265677d63e99..4da91b4e764a52 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -37,6 +37,7 @@ struct GpuLaunchConfig { dim3 theory_thread_count = dim3(1, 1, 1); dim3 thread_per_block = dim3(1, 1, 1); dim3 block_per_grid = dim3(1, 1, 1); + int compute_capability = 0; }; inline GpuLaunchConfig GetGpuLaunchConfig1D( @@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( std::min(max_threads, context.GetMaxThreadsPerBlock()); const int block_count = std::min(DivUp(physical_thread_count, thread_per_block), sm); + // Get compute_capability + const int capability = context.GetComputeCapability(); GpuLaunchConfig config; config.theory_thread_count.x = theory_thread_count; config.thread_per_block.x = thread_per_block; config.block_per_grid.x = block_count; + config.compute_capability = capability; return config; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5fcb1e30fbe677..5e5475da89f984 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -56,6 +56,7 @@ set(PYBIND_SRCS ir.cc inference_api.cc compatible.cc + io.cc generator_py.cc) if(WITH_ASCEND) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 8a5ad5852aedf5..b2572e5aa4ba15 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) + .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs) .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc new file mode 100644 index 00000000000000..fc49f76305461f --- /dev/null +++ b/paddle/fluid/pybind/io.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/pybind/io.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +void BindIO(pybind11::module *m) { + m->def("save_lod_tensor", [](const paddle::framework::LoDTensor &tensor, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", str_file_name)); + paddle::framework::SerializeToStream(fout, tensor); + + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + + m->def("load_lod_tensor", [](paddle::framework::LoDTensor &tensor, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load variables.", str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &tensor); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + + m->def("save_selected_rows", + [](const paddle::framework::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save SelectedRows.", str_file_name)); + + paddle::framework::SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + + m->def("load_selected_rows", + [](paddle::framework::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load SelectedRows.", str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + + m->def("save_lod_tensor_to_memory", + [](const paddle::framework::LoDTensor &tensor) -> py::bytes { + std::ostringstream ss; + paddle::framework::SerializeToStream(ss, tensor); + return ss.str(); + }); + + m->def("load_lod_tensor_from_memory", [](paddle::framework::LoDTensor &tensor, + const std::string &tensor_bytes) { + std::istringstream fin(tensor_bytes, std::ios::in | std::ios::binary); + paddle::framework::DeserializeFromStream(fin, &tensor); + }); + + m->def("save_selected_rows_to_memory", + [](const paddle::framework::SelectedRows &selected_rows) -> py::bytes { + std::ostringstream ss; + paddle::framework::SerializeToStream(ss, selected_rows); + return ss.str(); + }); + + m->def("load_selected_rows_from_memory", + [](paddle::framework::SelectedRows &selected_rows, + const std::string &selected_rows_bytes) { + std::istringstream fin(selected_rows_bytes, + std::ios::in | std::ios::binary); + paddle::framework::DeserializeFromStream(fin, &selected_rows); + }); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/pybind/io.h similarity index 53% rename from paddle/fluid/operators/log_loss_op.cu rename to paddle/fluid/pybind/io.h index 280913c43a2749..dfe3154cb95da5 100644 --- a/paddle/fluid/operators/log_loss_op.cu +++ b/paddle/fluid/pybind/io.h @@ -1,21 +1,24 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); +#pragma once + +#include +#include "paddle/fluid/pybind/pybind_boost_headers.h" + +namespace paddle { +namespace pybind { +void BindIO(pybind11::module* m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index bf3c77843219c7..6278a23cea6440 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -65,6 +65,7 @@ std::map> op_ins_map = { {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, + {"run_program", {"X", "Params"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -98,6 +99,7 @@ std::map> op_outs_map = { {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"run_program", {"DOut"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -148,6 +150,7 @@ std::map> op_passing_outs_map = { {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, + {"run_program", {"Out", "DOut", "OutScope"}}, }; // NOTE(pangyoki): Tensor View Strategy. @@ -173,7 +176,7 @@ std::set inplace_op_duplicable_ins_set = { // clang-format off const char* OUT_INITIALIZER_TEMPLATE = - R"({"%s", {std::shared_ptr(new imperative::VarBase(tracer->GenerateUniqueName()))}})"; + R"({"%s", {std::shared_ptr(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; @@ -255,12 +258,11 @@ R"( ConstructAttrMapFromPyArgs("%s", %d, &attrs, args); { py::gil_scoped_release release; - auto tracer = imperative::GetCurrentTracer(); %s imperative::NameVarBaseMap outs = %s; imperative::NameVarBaseMap ins = %s; %s - tracer->TraceOp("%s", ins, outs, attrs, {%s}); + imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s}); return %s; } })"; @@ -585,7 +587,8 @@ int main(int argc, char* argv[]) { out << "namespace py = pybind11;" << "\n"; out << "namespace paddle {\n" - << "namespace pybind {\n"; + << "namespace pybind {\n\n"; + out << "std::atomic VarBaseUniqueNameID{0};\n"; out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); out << "\n\n"; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6dd08e5dfa4bf2..86084297c4ae65 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -68,6 +68,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/io.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" #endif @@ -496,70 +497,6 @@ PYBIND11_MODULE(core_noavx, m) { #endif return tensor; }); - m.def("_save_lod_tensor", [](const LoDTensor &tensor, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", str_file_name)); - SerializeToStream(fout, tensor); - - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); - m.def("_load_lod_tensor", [](LoDTensor &tensor, - const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load variables.", str_file_name)); - - DeserializeFromStream(fin, &tensor); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); - m.def("_save_selected_rows", [](const SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fout), true, - platform::errors::Unavailable("Cannot open %s to save SelectedRows.", - str_file_name)); - - SerializeToStream(fout, selected_rows); - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); - m.def("_load_selected_rows", - [](SelectedRows &selected_rows, const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load SelectedRows.", str_file_name)); - - DeserializeFromStream(fin, &selected_rows); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); - m.def("_save_static_dict", - [](const std::string &str_file_name, const py::handle &vec_var_list, - const Scope &scope) { - std::vector vec_name_list = GetNameList(vec_var_list); - SaveStaticNameListToDisk(str_file_name, vec_name_list, scope); - }); - - m.def("_load_static_dict", - [](const std::string &str_file_name, const py::handle &vec_var_list, - const Scope &scope, const Executor *executor) { - std::vector vec_name_list = GetNameList(vec_var_list); - CreateVariableIfNotExit(vec_var_list, scope, executor); - LoadStaticNameListFromDisk(str_file_name, vec_name_list, scope); - }); m.def("_create_loaded_parameter", [](const py::handle &vec_var_list, const Scope &scope, @@ -567,26 +504,6 @@ PYBIND11_MODULE(core_noavx, m) { CreateVariableIfNotExit(vec_var_list, scope, executor); }); - m.def("_save_dygraph_dict", [](const std::string &str_file_name, - const PyNameVarBaseMap &state_dict) { - auto vec_var_base_list = GetVarBaseList(state_dict); - - SaveDygraphVarBaseListToDisk(str_file_name, vec_var_base_list); - }); - - m.def("_load_dygraph_dict", [](const std::string &str_file_name) { - auto load_tensor = LoadDygraphVarBaseListFromDisk(str_file_name); - - std::unordered_map> - map_output; - - for (size_t i = 0; i < load_tensor.size(); ++i) { - map_output.emplace(load_tensor[i]->Name(), load_tensor[i]); - } - - return map_output; - }); - m.def("save_op_version_info", [](framework::ProgramDesc &desc) { framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; framework::compatible::SaveOpVersions( @@ -3111,6 +3028,7 @@ All parameter, weight, gradient are variables in Paddle. 
.def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); + BindIO(&m); #ifdef PADDLE_WITH_PSLIB BindHeterWrapper(&m); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 8c323490cc964c..c4a93f0d4a1e9f 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -22,11 +22,12 @@ setlocal enabledelayedexpansion rem -------clean up environment----------- set work_dir=%cd% -set cache_dir=%work_dir:Paddle=cache% +if not defined cache_dir set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) taskkill /f /im cmake.exe 2>NUL +taskkill /f /im ninja.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im cl.exe 2>NUL taskkill /f /im lib.exe 2>NUL @@ -217,7 +218,8 @@ set CUDA_ARCH_NAME=All call :cmake || goto cmake_error call :build || goto build_error -call :zip_file || goto zip_file_error +call :zip_cc_file || goto zip_cc_file_error +call :zip_c_file || goto zip_c_file_error goto:success rem "Other configurations are added here" @@ -689,7 +691,7 @@ goto:eof exit /b 1 rem --------------------------------------------------------------------------------------------- -:zip_file +:zip_cc_file tree /F %cd%\paddle_inference_install_dir\paddle if exist paddle_inference.zip del paddle_inference.zip python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')" @@ -701,10 +703,27 @@ for /F %%i in ("%libsize%") do ( ) goto:eof -:zip_file_error +:zip_cc_file_error echo Tar inference library failed! exit /b 1 +rem --------------------------------------------------------------------------------------------- +:zip_c_file +tree /F %cd%\paddle_inference_c_install_dir\paddle +if exist paddle_inference_c.zip del paddle_inference_c.zip +python -c "import shutil;shutil.make_archive('paddle_inference_c', 'zip', root_dir='paddle_inference_c_install_dir')" +%cache_dir%\tools\busybox64.exe du -h -k paddle_inference_c.zip > lib_size.txt +set /p libsize=< lib_size.txt +for /F %%i in ("%libsize%") do ( + set /a libsize_m=%%i/1024 + echo "Windows Paddle_Inference CAPI ZIP Size: !libsize_m!M" +) +goto:eof + +:zip_c_file_error +echo Tar inference capi library failed! +exit /b 1 + :timestamp setlocal enabledelayedexpansion @ECHO OFF @@ -763,6 +782,7 @@ echo ======================================== echo Clean up environment at the end ... 
echo ======================================== taskkill /f /im cmake.exe 2>NUL +taskkill /f /im ninja.exe 2>NUL taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im git.exe 2>NUL taskkill /f /im cl.exe 2>NUL diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2eda3d04f81c7c..96dc8c67969458 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -426,6 +426,13 @@ EOF buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}') echo "Paddle_Inference Size: $buildSize" echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt + elif [ "$1" == "paddle_inference_c" ]; then + cd ${PADDLE_ROOT}/build + cp -r paddle_inference_c_install_dir paddle_inference_c + tar -czf paddle_inference_c.tgz paddle_inference_c + buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference_c.tgz |awk '{print $1}') + echo "Paddle_Inference Capi Size: $buildSize" + echo "ipipe_log_param_Paddle_Inference_capi_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt else SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then @@ -1234,21 +1241,21 @@ set +x fi if [[ "$is_exclusive" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$" else exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$" fi elif [[ "$is_multicard" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$" else multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$" fi else - if [[ $(echo $cpu_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$" - elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$" else single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$" @@ -1941,6 +1948,7 @@ EOF echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt build_size "paddle_inference" + build_size "paddle_inference_c" } function tar_fluid_lib() { @@ -2001,12 +2009,16 @@ function build_document_preview() { sh /paddle/tools/document_preview.sh ${PORT} } - -function example() { +# origin name: example +function exec_samplecode_test() { pip install ${PADDLE_ROOT}/build/python/dist/*.whl paddle version cd ${PADDLE_ROOT}/tools - python sampcd_processor.py cpu;example_error=$? + if [ "$1" = "cpu" ] ; then + python sampcd_processor.py cpu; example_error=$? + elif [ "$1" = "gpu" ] ; then + python sampcd_processor.py --threads=16 --full-test gpu; example_error=$? 
+ fi if [ "$example_error" != "0" ];then echo "Code instance execution failed" >&2 exit 5 @@ -2119,7 +2131,7 @@ function main() { check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" set +e - example_info=$(example) + example_info=$(exec_samplecode_test cpu) example_code=$? summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" assert_api_spec_approvals @@ -2182,6 +2194,17 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + cpu_cicheck_coverage) + check_approvals_of_unittest 1 + check_diff_file_for_coverage + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + enable_unused_var_check + ;; + gpu_cicheck_coverage) + parallel_test + check_coverage + check_change_of_unittest ${PYTHON_ABI:-""} + ;; ci_preciseTest) insert_pile_to_h_cu_diff cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} @@ -2267,7 +2290,11 @@ function main() { build_document_preview ;; api_example) - example + example_info=$(exec_samplecode_test cpu) + example_code=$? + check_style_code=0 + check_style_info= + summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" ;; test_op_benchmark) test_op_benchmark diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 7bac330376c44f..b0f0f326bd715e 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -72,6 +72,7 @@ from .tensor.attribute import imag # noqa: F401 from .tensor.creation import to_tensor # noqa: F401 from .tensor.creation import diag # noqa: F401 +from .tensor.creation import diagflat # noqa: F401 from .tensor.creation import eye # noqa: F401 from .tensor.creation import linspace # noqa: F401 from .tensor.creation import ones # noqa: F401 @@ -206,6 +207,8 @@ from .tensor.math import prod # noqa: F401 from .tensor.math import broadcast_shape # noqa: F401 from .tensor.math import conj # noqa: F401 +from .tensor.math import neg # noqa: F401 +from .tensor.math import lgamma # noqa: F401 from .tensor.random import multinomial # noqa: F401 from .tensor.random import standard_normal # noqa: F401 @@ -301,6 +304,7 @@ 'add', 'subtract', 'diag', + 'diagflat', 'isnan', 'scatter_nd_add', 'unstack', @@ -422,6 +426,8 @@ 'prod', 'broadcast_shape', 'conj', + 'neg', + 'lgamma', 'square', 'divide', 'ceil', diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 2a476f63862cfa..b712729f6420d0 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -25,6 +25,7 @@ import paddle.dataset import six.moves.cPickle as pickle import glob +import paddle __all__ = [] @@ -95,16 +96,19 @@ def download(url, module_name, md5sum, save_name=None): chunk_size = 4096 total_length = int(total_length) total_iter = total_length / chunk_size + 1 - log_interval = total_iter / 20 if total_iter > 20 else 1 + log_interval = total_iter // 20 if total_iter > 20 else 1 log_index = 0 + bar = paddle.hapi.progressbar.ProgressBar( + total_iter, name='item') for data in r.iter_content(chunk_size=chunk_size): if six.PY2: data = six.b(data) f.write(data) log_index += 1 + bar.update(log_index, {}) if log_index % log_interval == 0: - sys.stderr.write(".") - sys.stdout.flush() + bar.update(log_index) + except Exception as e: # re-try continue diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index c20672c2ce1577..493a94e45d462a 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -93,7 +93,7 @@ def batch_images_from_tar(data_file, :rtype: string """ 
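+    # NOTE (assumed intent of the change below): out_path is suffixed with
+    # the current pid so that concurrent runs do not overwrite each other's
+    # batch files.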
batch_dir = data_file + "_batch" - out_path = "%s/%s" % (batch_dir, dataset_name) + out_path = "%s/%s_%s" % (batch_dir, dataset_name, os.getpid()) meta_file = "%s/%s.txt" % (batch_dir, dataset_name) if os.path.exists(out_path): diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 4f3a6f4768933d..0ffb1d9f881ba1 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -239,31 +239,37 @@ def new_group(ranks=None, backend=None): if global_rank not in ranks: gp = Group(-1, -1, ring_id, ranks) _group_map[ring_id] = gp - return gp - - ranks = sorted(ranks) - group_rank = ranks.index(global_rank) - group_size = len(ranks) - gp = Group(group_rank, group_size, ring_id, ranks) - _group_map[ring_id] = gp - - if group_size < 2: - return gp - - strategy = core.ParallelStrategy() - strategy.nranks = group_size - strategy.local_rank = group_rank - strategy.trainer_endpoints = [genv.trainer_endpoints[i] for i in ranks] - strategy.current_endpoint = genv.current_endpoint - strategy.nrings = 1 - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(genv.device_id) - core.NCCLParallelContext(strategy, place).init_with_ring_id(ring_id) else: - assert False, ("no cuda device found") - # need to barrier to construct group - barrier(gp) + ranks = sorted(ranks) + group_rank = ranks.index(global_rank) + group_size = len(ranks) + gp = Group(group_rank, group_size, ring_id, ranks) + _group_map[ring_id] = gp + + if group_size >= 2: + strategy = core.ParallelStrategy() + strategy.nranks = group_size + strategy.local_rank = group_rank + strategy.trainer_endpoints = [ + genv.trainer_endpoints[i] for i in ranks + ] + strategy.current_endpoint = genv.current_endpoint + strategy.nrings = 1 + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(genv.device_id) + core.NCCLParallelContext(strategy, + place).init_with_ring_id(ring_id) + else: + assert False, ("no cuda device found") + else: + return gp + + # TODO(shenliang03): This is a temporary solution to solve the problem of + # hang caused by cross-creation of new_group + tmp = fill_constant([0], dtype="int32", value="1") + paddle.distributed.all_reduce(tmp, use_calc_stream=True) + paddle.distributed.wait(tmp) return gp @@ -775,7 +781,7 @@ def _c_identity(tensor, group=None): return out -def _c_concat(tensor, nranks, group=None): +def _c_concat(tensor, group=None): """ Return allgather of the tensor, mainly used with model parallel. @@ -791,10 +797,14 @@ def _c_concat(tensor, nranks, group=None): return ring_id = 0 if group is None else group.id + global_rank = _get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = _get_global_env().world_size if group is None else group.nranks + if in_dygraph_mode(): return core.ops.c_concat(tensor, 'ring_id', ring_id, 'use_calc_stream', - True, 'nranks', nranks, 'use_model_parallel', - True) + True, 'rank', rank, 'nranks', nranks, + 'use_model_parallel', True) op_type = 'c_concat' helper = LayerHelper(op_type, **locals()) @@ -812,12 +822,13 @@ def _c_concat(tensor, nranks, group=None): 'ring_id': ring_id, 'use_calc_stream': True, 'use_model_parallel': True, - 'nranks': nranks + 'nranks': nranks, + 'rank': rank }) return out -def _c_split(tensor, rank, nranks, group=None): +def _c_split(tensor, group=None): """ Split tensor evenly among all members, mainly used with model parallel. 
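+
+    Note (sketch, not part of the patch): rank and nranks are now derived
+    from the group instead of being passed in, so a call such as
+    ``_c_split(x, rank, nranks, group=g)`` becomes ``_c_split(x, group=g)``.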
@@ -834,6 +845,10 @@ def _c_split(tensor, group=None):
         return
     ring_id = 0 if group is None else group.id
 
+    global_rank = _get_global_env().rank
+    rank = global_rank if group is None else group.get_group_rank(global_rank)
+    nranks = _get_global_env().world_size if group is None else group.nranks
+
     if in_dygraph_mode():
         return core.ops.c_split(tensor, 'use_calc_stream', True, 'ring_id',
                                 ring_id, 'rank', rank, 'nranks', nranks,
@@ -879,8 +894,56 @@ def _mp_allreduce(tensor,
                     "use_model_parallel", use_model_parallel)
         else:
             raise ValueError("Unknown parameter: {}.".format(op))
-    else:
-        raise NotImplementedError("No support _mp_allreduce in dygraph mode.")
+
+    op_type = 'c_allreduce_sum'
+    helper = LayerHelper(op_type, **locals())
+    out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
+
+    check_variable_and_dtype(
+        tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
+        op_type)
+
+    helper.append_op(
+        type=op_type,
+        inputs={'X': tensor},
+        outputs={'Out': out},
+        attrs={
+            'ring_id': ring_id,
+            'use_calc_stream': use_calc_stream,
+            'use_model_parallel': use_model_parallel,
+        })
+    return out
+
+
+def _c_lookup_table(table, index, start_index=0, name=None):
+    """
+    Lookup table according to index.
+
+    Args:
+        table (Tensor): The input Tensor. Its data type
+            should be float16, float32, float64.
+        index (Tensor): The index to lookup table.
+        start_index (int): The initial index for table range.
+        name (string): The name of the api
+
+    Returns:
+        Tensor.
+    """
+    if in_dygraph_mode():
+        return core.ops.c_embedding(table, index, "start_index", start_index)
+
+    op_type = 'c_embedding'
+    helper = LayerHelper(op_type, **locals())
+    dtype = helper.input_dtype(input_param_name='table')
+    check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type)
+    tmp = helper.create_variable_for_type_inference(dtype)
+    helper.append_op(
+        type='c_embedding',
+        inputs={'Ids': index,
+                'W': table},
+        outputs={'Out': tmp},
+        attrs={"start_index": start_index})
+    return tmp
 
 
 class _Linear(layers.Layer):
@@ -921,6 +984,35 @@ def extra_repr(self):
             self.weight.shape[0], self.weight.shape[1], self._dtype, name_str)
 
 
+def _c_softmax_with_cross_entropy(logits,
+                                  label,
+                                  group=None,
+                                  return_softmax=False):
+    if group is not None and not group.is_member():
+        return
+    ring_id = 0 if group is None else group.id
+    global_rank = _get_global_env().rank
+    rank = global_rank if group is None else group.get_group_rank(global_rank)
+    nranks = _get_global_env().world_size if group is None else group.nranks
+
+    input_dims = len(list(logits.shape))
+    label_dims = len(list(label.shape))
+    if input_dims - 1 != label_dims and input_dims != label_dims:
+        raise ValueError(
+            'Expected input_dims - 1 == label_dims or input_dims == label_dims\
+             (got input_dims{}, label_dims{})'.format(input_dims, label_dims))
+    if input_dims - 1 == label_dims:
+        label = paddle.unsqueeze(label, axis=-1)
+
+    if in_dygraph_mode():
+        softmax, loss = core.ops.c_softmax_with_cross_entropy(
+            logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks)
+        if not return_softmax:
+            return loss
+        else:
+            return loss, softmax
+
+
 def _linear(x, weight, bias=None, name=None):
     """
    Function Linear
@@ -989,7 +1081,7 @@ def _parallel_linear(x,
 
     if axis == 0:
         if split_tensor:
-            x = _c_split(x, inner_rank, nranks, group=group)
+            x = _c_split(x, group=group)
     else:
         x = _c_identity(x, group=group)
 
@@ -1009,16 +1101,18 @@ def _parallel_linear(x,
         name=name)
 
     linear_out = linear(x)
-    startup_block = paddle.static.default_startup_program().global_block()
-    main_block = paddle.static.default_main_program().global_block()
-    startup_block.vars[linear.weight.name].is_distributed = True
-    main_block.vars[linear.weight.name].is_distributed = True
+    startup_block = paddle.static.default_startup_program().current_block()
+    main_block = paddle.static.default_main_program().current_block()
+    startup_block._find_var_recursive(linear.weight.name).is_distributed = True
+    main_block._find_var_recursive(linear.weight.name).is_distributed = True
+
+    # set is_distributed for split bias
+    # if a linear layer is split by row, each rank would hold a complete bias and they should be the same in each rank.
+    # if a linear layer is split by col, the bias would also be split into each rank as its weight
     if axis == 1 and linear._bias_attr != False:
-        startup_block.vars[linear.bias.name].is_distributed = True
-        main_block.vars[linear.bias.name].is_distributed = True
+        startup_block._find_var_recursive(
+            linear.bias.name).is_distributed = True
+        main_block._find_var_recursive(linear.bias.name).is_distributed = True
 
     if not gather_out: return linear_out
 
@@ -1072,47 +1166,34 @@ def _parallel_embedding(x,
         return
     ring_id = 0 if group is None else group.id
 
-    origin_num_embeddings = origin_size[0]
-    embedding = paddle.nn.Embedding(
-        per_part_embeddings,
-        origin_size[1],
-        padding_idx=per_part_embeddings - 1,
-        sparse=False,
-        weight_attr=param_attr,
-        name=name)
-
-    origin_input_shape = x.shape
-    if len(origin_input_shape) == 2:
-        x = paddle.unsqueeze(x, axis=-1)
-    else:
-        assert origin_input_shape[-1] == 1, (
-            "The last dimension size of x must be 1.")
-    x_shard = paddle.shard_index(x, origin_num_embeddings, num_partitions,
-                                 inner_rank, per_part_embeddings - 1)
-    if len(origin_input_shape) == 2:
-        x_shard = paddle.squeeze(x_shard, axis=-1)
-    emb_out = embedding(x_shard)
+    helper = LayerHelper("_parallel_embedding", **locals())
+
+    per_part_size = per_part_embeddings
+    rank = inner_rank
+
+    vocab_start_index = rank * per_part_size
+    dtype = helper.get_default_dtype()
+    size = [per_part_size, origin_size[1]]
+
+    weight = helper.create_parameter(
+        attr=param_attr, shape=size, dtype=dtype, is_bias=False)
+
+    if num_partitions == 1:
+        return paddle.nn.functional.embedding(
+            x, weight=weight, padding_idx=None, sparse=False, name=name)
+
     startup_block = paddle.static.default_startup_program().global_block()
     main_block = paddle.static.default_main_program().global_block()
-    startup_block.vars[embedding.weight.name].is_distributed = True
-    main_block.vars[embedding.weight.name].is_distributed = True
-    out = main_block.create_var(
-        shape=emb_out.shape,
-        dtype=emb_out.dtype,
-        type=emb_out.type,
-        lod_level=emb_out.lod_level,
-        persistable=False,
-        is_data=False,
-        need_check_feed=emb_out.desc.need_check_feed())
-    main_block.append_op(
-        type='c_allreduce_sum',
-        inputs={'X': emb_out},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            'use_model_parallel': True
-        })
+    startup_block.vars[weight.name].is_distributed = True
+    main_block.vars[weight.name].is_distributed = True
+
+    output_parallel = paddle.distributed.collective._c_lookup_table(
+        weight, x, start_index=vocab_start_index, name=name)
+    out = paddle.distributed.collective._mp_allreduce(
+        output_parallel,
+        group=group,
+        use_calc_stream=True,
+        use_model_parallel=True)
     return out
 
@@ -1224,11 +1305,11 @@ def split(x,
     if operation == "embedding":
         assert axis == 0, ("We only support splitting the weight of embedding "
                            "along the
first axis now.") - per_part_size = (size[0] + num_partitions - 1) // num_partitions - last_part_size = size[0] - per_part_size * (num_partitions - 1) - if inner_rank == num_partitions - 1: per_part_size = last_part_size - per_part_size += 1 # make the last row as the padding index + assert size[0] % num_partitions == 0, \ + "The length of the vocabulary must be divisible by num_partitions " \ + "but received vocabulary={} num_partitions={}".format(size[0], num_partitions) + per_part_size = size[0] // num_partitions emb_out = _parallel_embedding( x, per_part_size, diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index 27437c50fad66a..2f6c210165ec15 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -80,8 +80,9 @@ def _get_ascend_rankfile(rank_table_file_path): nodes = os.getenv("DLS_TASK_NUMBER", None) assert nodes is not None, "DLS_TASK_NUMBER didn't set!" for node in range(int(nodes)): - node_ip = os.getenv(f"VC_CUSTOM{node}_HOSTS", None) - assert node_ip is not None, f"VC_CUSTOM{node}_HOSTS didn't set!" + node_ip = os.getenv("VC_CUSTOM{}_HOSTS".format(node), None) + assert node_ip is not None, "VC_CUSTOM{}_HOSTS didn't set!".format( + node) node_ips.append(node_ip) return node_ips, device_count node_ips.append(server['server_id']) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 0a989fe90f96a6..e44a0e0459d31f 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -14,7 +14,7 @@ import paddle from paddle.distributed.fleet.proto import distributed_strategy_pb2 -from paddle.fluid.framework import Variable, set_flags, core +from paddle.fluid.framework import Variable, set_flags, core, _global_flags from paddle.fluid.wrapped_decorator import wrap_decorator import google.protobuf.text_format import google.protobuf @@ -121,18 +121,18 @@ def __init__(self): # Set the default values of the following flags to the ones set by users key = 'FLAGS_cudnn_batchnorm_spatial_persistent' - if core.globals().is_public(key): + if _global_flags().is_public(key): self.strategy.cudnn_batchnorm_spatial_persistent = bool( - core.globals()[key]) + _global_flags()[key]) key = 'FLAGS_conv_workspace_size_limit' - if core.globals().is_public(key): - self.strategy.conv_workspace_size_limit = int(core.globals()[key]) + if _global_flags().is_public(key): + self.strategy.conv_workspace_size_limit = int(_global_flags()[key]) key = 'FLAGS_cudnn_exhaustive_search' - if core.globals().is_public(key): - self.strategy.cudnn_exhaustive_search = bool(core.globals()[key]) + if _global_flags().is_public(key): + self.strategy.cudnn_exhaustive_search = bool(_global_flags()[key]) key = 'FLAGS_sync_nccl_allreduce' - if core.globals().is_public(key): - self.strategy.sync_nccl_allreduce = bool(core.globals()[key]) + if _global_flags().is_public(key): + self.strategy.sync_nccl_allreduce = bool(_global_flags()[key]) self.__lock_attr = True @@ -286,7 +286,7 @@ def a_sync(self, flag): self.a_sync_configs = {"k_steps": 0} else: raise ValueError( - "The type of `flag` is invalid, expected type is bool, but received %s". + "The type of `flag` is invalid, expected type is bool, but received {}". 
                format(type(flag)))
 
     @property
@@ -853,6 +853,27 @@ def without_graph_optimization(self, flag):
                 "WARNING: without_graph_optimization should have value of bool type"
             )
 
+    @property
+    def fuse_grad_size_in_num(self):
+        """
+        This option works with the raw_program_optimizer: it sets how many
+        gradients are fused into one group for a single allreduce.
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.fuse_grad_size_in_num = 2
+        """
+        return self.strategy.fuse_grad_size_in_num
+
+    @fuse_grad_size_in_num.setter
+    @is_strict_auto
+    def fuse_grad_size_in_num(self, num):
+        if isinstance(num, int):
+            self.strategy.fuse_grad_size_in_num = num
+        else:
+            print(
+                "WARNING: fuse_grad_size_in_num should have value of int32 type")
+
     @property
     def pipeline(self):
         """
@@ -1561,8 +1582,8 @@ def _enable_env(self):
         ]
 
         for i, key in enumerate(keys):
-            if core.globals().is_public(key):
-                core.globals()[key] = values[i]
+            if _global_flags().is_public(key):
+                _global_flags()[key] = values[i]
 
     def _is_strict_auto(self):
         global non_auto_func_called
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index c69b21538b61ad..ee5eb807fad701 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -83,7 +83,7 @@ def __eq__(self, cluster):
     def __ne__(self, cluster):
         return not self.__eq__(cluster)
 
-    def update_pods(cluster):
+    def update_pods(self, cluster):
         self.pods = copy.copy(cluster.pods)
 
     def trainers_nranks(self):
@@ -195,7 +195,7 @@ def __eq__(self, pod):
             self.id != pod.id or \
             self.addr != pod.addr or \
             self.port != pod.port:
-            logger.debug("pod {} != pod".format(self, pod))
+            logger.debug("pod {} != {}".format(self, pod))
             return False
 
         if len(self.trainers) != len(pod.trainers):
diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 707284a784c38e..9e891062bcbccb 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -77,9 +77,12 @@ def _init_communicator(self,
                            wait_port,
                            global_ring_id=None,
                            sync=True):
-        nranks = len(endpoints)
-        other_endpoints = endpoints[:]
-        other_endpoints.remove(current_endpoint)
+        # if current_endpoint is None, it means just for sync,
+        # no group is created.
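+        # Call sketch (assumed from the pipeline_optimizer call site later in
+        # this patch):
+        #     _init_communicator(program, ep, eps, rank, ring_id, True)
+        #         -> builds a real communication group
+        #     _init_communicator(program, None, None, None, None, False,
+        #                        global_ring_id, True)
+        #         -> sync-only: just runs the barrier allreduce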
+ if current_endpoint: + nranks = len(endpoints) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) if rank == 0 and wait_port: wait_server_ready(other_endpoints) @@ -117,6 +120,12 @@ def _add_sync_by_allreduce(block): attrs={OP_ROLE_KEY: OpRole.Forward}) block = program.global_block() + if current_endpoint is None: + assert endpoints is None + assert sync + _add_sync_by_allreduce(block) + return + if core.is_compiled_with_cuda(): comm_id_var = block.create_var( name=unique_name.generate('nccl_id'), diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index c2d79a62c7663a..bceabeee3c3dce 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -14,6 +14,7 @@ from __future__ import print_function import sys +import paddle from paddle.optimizer import Optimizer from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.hybrid_parallel_util import fused_allreduce_gradients @@ -22,6 +23,8 @@ from paddle.fluid import framework from paddle.fluid.framework import Variable from ...utils.log_util import logger +from paddle.fluid import core +from paddle.fluid import layers __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index a0bf4cc5bc0975..481b90910def17 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -138,6 +138,9 @@ def _init_process_group(self, pipeline_pair, pipeline_ring_map): first_node = pair[0] + start_index second_node = pair[1] + start_index if self.rank != first_node and self.rank != second_node: + collective_helper._init_communicator( + self.startup_program, None, None, None, None, False, + self.global_ring_id, True) continue pipeline_endpoints = [ self.endpoints[first_node], self.endpoints[second_node] diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 243f6efe53185d..1333f794cc97e3 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -14,9 +14,12 @@ from __future__ import print_function
 from __future__ import division
 
 import os
+import collections
+import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core, unique_name
+from paddle.fluid.dygraph import Layer, LayerList
 from ..base.private_helper_function import wait_server_ready
 from .meta_optimizer_base import MetaOptimizerBase
 from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op
@@ -38,6 +41,9 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
         super(RawProgramOptimizer, self)._set_basic_info(
             loss, role_maker, user_defined_optimizer, user_defined_strategy)
         self.without_graph_optimization = user_defined_strategy.without_graph_optimization
+        self.fuse_all_reduce_ops = user_defined_strategy.fuse_all_reduce_ops
+        if self.fuse_all_reduce_ops:
+            self.fuse_grad_size_in_num = user_defined_strategy.fuse_grad_size_in_num
 
     def _can_apply(self):
         if not self.role_maker._is_collective:
@@ -113,7 +119,8 @@ def minimize_impl(self,
         optimize_ops, params_grads = self.inner_opt.minimize(
             loss, startup_program, parameter_list, no_grad_set)
-
+        if self.nranks == 1:
+            return optimize_ops, params_grads
         self._init_process_group()
 
         self.main_program = program
@@ -123,7 +130,11 @@ def minimize_impl(self,
 
     def _transpile_main_program(self, loss):
         self._insert_loss_grad_ops(loss)
-        self._insert_allreduce_ops()
+        if self.fuse_all_reduce_ops and core.is_compiled_with_npu():
+            self._calc_stream = True
+            self._allreduce_fusion_program()
+        else:
+            self._insert_allreduce_ops()
 
     def _insert_loss_grad_ops(self, loss):
         """
@@ -194,3 +205,260 @@ def _insert_allreduce_ops(self):
                         attrs={'ring_id': ring_id,
                                OP_ROLE_KEY: OpRole.Backward})
                     break
+
+    # TODO(Liu yuang): add a CUDA allreduce_fusion function.
+    # This function fuses the inputs of allreduce into larger buffers, which
+    # cuts the number of allreduce calls and saves communication time.
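+    # Rough illustration (an assumption, not taken from the patch): with
+    # fuse_grad_size_in_num = 2 and backward grads [g0, g1, g2, g3, g4], the
+    # pass coalesces them into buffers [(g0, g1), (g2, g3), (g4,)] and emits
+    # one c_allreduce_sum per fused buffer instead of five single-grad calls.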
+    def _allreduce_fusion_program(self):
+        block = self.main_program.global_block()
+        ring_id = self.global_ring_id
+        record_idx, allreduce_input_vars, allreduce_output_vars = [], [], []
+        block_ops = len(list(enumerate(block.ops)))
+
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if is_backward_op(op) and \
+                    OP_ROLE_VAR_KEY in op.attr_names:
+                op_role_var = op.attr(OP_ROLE_VAR_KEY)
+                if len(op_role_var) == 0:
+                    continue
+                assert len(op_role_var) % 2 == 0
+                for i in range(0, len(op_role_var), 2):
+                    param_name = op_role_var[i]
+                    param = block.var(param_name)
+                    grad_name = op_role_var[i + 1]
+                    grad = block.var(grad_name)
+                    if param.is_distributed:
+                        continue
+                    if ".cast_fp16@GRAD" in grad_name:
+                        param_name = param_name + ".cast_fp16"
+                        if not block.has_var(param_name):
+                            raise ValueError("op cast name error {}".format(
+                                op.type))
+                        else:
+                            param = block.var(param_name)
+
+                    if len(allreduce_output_vars) == 0:
+                        allreduce_output_vars.append([grad])
+                        allreduce_input_vars.append([param])
+                        if self.fuse_grad_size_in_num == 1:
+                            record_idx.append([idx, idx])
+                            continue
+                        record_idx.append([-2, idx])
+                    elif len(allreduce_output_vars[
+                            -1]) == self.fuse_grad_size_in_num:
+                        allreduce_output_vars.append([grad])
+                        allreduce_input_vars.append([param])
+                        if self.fuse_grad_size_in_num == 1:
+                            record_idx.append([idx, idx])
+                            continue
+                        if idx != block_ops - 1:
+                            record_idx.append([-2, idx])
+                    else:
+                        allreduce_output_vars[-1].append(grad)
+                        allreduce_input_vars[-1].append(param)
+                        record_idx[-1][0] = idx
+
+        if record_idx[-1][0] == -2:
+            record_idx[-1][0] = record_idx[-1][1]
+
+        assert len(allreduce_output_vars) == len(
+            record_idx
+        ), "allreduce_output_vars and record_idx must have the same length."
+
+        if not allreduce_output_vars or not allreduce_input_vars:
+            return
+
+        self.vars = collections.OrderedDict()
+        index, offset_pos, pos, offset = 0, 0, 0, 0
+        start, end = record_idx[index]
+        men_list = [end, start]
+
+        # A note on the flag: when fusing ops we can meet different groups
+        # that share the same op. Because coalesce_tensor ops are inserted
+        # while walking the ops in reverse, the flag records whether a
+        # coalesce_tensor has already been inserted for the current op.
+        # For example:
+        # [(3, 2), (2, 2), (1, 0)]: (3, 2) and (2, 2) share the same op, but
+        # belong to different groups.
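+        # (A reading of the bookkeeping above, not a spec: each record_idx
+        # entry is a [start_op_idx, end_op_idx] pair for one fused group, and
+        # men_list remembers boundaries already visited so a repeated
+        # boundary can be detected through the flag in the loop below.)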
+ + for idx, op in reversed(list(enumerate(block.ops))): + if idx == start: + pos = 0 + flag = True if end == men_list[-1] else False + offset = offset_pos if flag else 0 + done_output_vars, done_input_vars = self._split_fuction( + allreduce_output_vars[index], allreduce_input_vars[index]) + for id_, done_output_var in enumerate(done_output_vars): + if flag: + tmp_var = block.create_var( + name=unique_name.generate( + 'FusedOutput_{}_{}'.format(start, id_ + + offset)), + dtype=done_output_var[0].dtype, + persistable=False, + stop_gradient=True) + self.vars['FusedOutput_{}_{}'.format(start, id_ + + offset)] = tmp_var + + block._insert_op( + idx + id_ + offset, + type="coalesce_tensor", + inputs={"Input": done_input_vars[id_]}, + outputs={ + "Output": done_output_var, + "FusedOutput": tmp_var + }, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": done_output_var[0].dtype + }) + pos += 1 + else: + tmp_var = block.create_var( + name=unique_name.generate( + 'FusedOutput_{}_{}'.format(start, id_)), + dtype=done_output_var[0].dtype, + persistable=False, + stop_gradient=True) + self.vars['FusedOutput_{}_{}'.format(start, + id_)] = tmp_var + + block._insert_op( + idx + id_, + type="coalesce_tensor", + inputs={"Input": done_input_vars[id_]}, + outputs={ + "Output": done_output_var, + "FusedOutput": tmp_var + }, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": done_output_var[0].dtype + }) + pos += 1 + offset_pos = pos + + # TODO(Liu yuang): ADD CUDA and NPU's EVENT and c_allreduce_sum. + for id_ in range(len(done_output_vars)): + if flag: + block._insert_op( + end + id_ + pos + 1, + type='c_allreduce_sum', + inputs={ + 'X': self.vars['FusedOutput_{}_{}'.format( + start, id_ + offset)] + }, + outputs={ + 'Out': self.vars['FusedOutput_{}_{}'.format( + start, id_ + offset)] + }, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True + if self._calc_stream else False, + OP_ROLE_KEY: OpRole.Backward + }) + else: + block._insert_op( + end + id_ + pos + 1, + type='c_allreduce_sum', + inputs={ + 'X': self.vars['FusedOutput_{}_{}'.format(start, + id_)] + }, + outputs={ + 'Out': self.vars['FusedOutput_{}_{}'.format( + start, id_)] + }, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True + if self._calc_stream else False, + OP_ROLE_KEY: OpRole.Backward + }) + index += 1 + men_list.append(end) + men_list.append(start) + if len(record_idx) == index: + start = end = -1 + continue + start, end = record_idx[index] + + if not self._calc_stream: + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': block.create_var()}, + outputs={'Out': block.create_var()}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward + }) + break + + # Integrate grads of the same type to form a combination. If skip_comb is selected, will return grads of the same group. 
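+    # Grouping by dtype keeps every fused buffer homogeneous, which the
+    # coalesce_tensor op requires since each buffer carries a single "dtype"
+    # attribute (fp16 and fp32 grads cannot share one buffer).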
+ # For example:[(fp16, fp16), (fp32), (fp16)] -> [(fp16, fp16, fp16), (fp32)] + def _split_fuction(self, + allreduce_output_vars, + allreduce_input_vars, + skip_comb=True): + input_vars, final_input_vars, output_vars, final_output_vars = [], [], [], [] + if len(allreduce_output_vars) - 1 == 0: + final_output_vars.append(allreduce_output_vars) + final_input_vars.append(allreduce_input_vars) + return final_output_vars, final_input_vars + + for idx in range(len(allreduce_input_vars) - 1): + if allreduce_input_vars[idx].dtype == allreduce_input_vars[idx + + 1].dtype: + input_vars.append(allreduce_input_vars[idx]) + if idx == len(allreduce_input_vars) - 2: + input_vars.append(allreduce_input_vars[idx + 1]) + final_input_vars.append(input_vars) + else: + input_vars.append(allreduce_input_vars[idx]) + final_input_vars.append(input_vars) + input_vars = [] + if idx == len(allreduce_input_vars) - 2: + input_vars.append(allreduce_input_vars[idx + 1]) + final_input_vars.append(input_vars) + + for idx in range(len(allreduce_output_vars) - 1): + if allreduce_output_vars[idx].dtype == allreduce_output_vars[ + idx + 1].dtype: + output_vars.append(allreduce_output_vars[idx]) + if idx == len(allreduce_output_vars) - 2: + output_vars.append(allreduce_output_vars[idx + 1]) + final_output_vars.append(output_vars) + else: + output_vars.append(allreduce_output_vars[idx]) + final_output_vars.append(output_vars) + output_vars = [] + if idx == len(allreduce_output_vars) - 2: + output_vars.append(allreduce_output_vars[idx + 1]) + final_output_vars.append(output_vars) + if skip_comb: + input_fp16_vars, input_fp32_vars, output_fp16_vars, output_fp32_vars = [], [], [], [] + for final_input_var in final_input_vars: + if final_input_var[0].dtype == core.VarDesc.VarType.FP16: + input_fp16_vars.extend(final_input_var) + else: + input_fp32_vars.extend(final_input_var) + + for final_output_var in final_output_vars: + if final_output_var[0].dtype == core.VarDesc.VarType.FP16: + output_fp16_vars.extend(final_output_var) + else: + output_fp32_vars.extend(final_output_var) + final_output_vars, final_input_vars = [], [] + if output_fp16_vars: + final_output_vars.append(output_fp16_vars) + if output_fp32_vars: + final_output_vars.append(output_fp32_vars) + if input_fp16_vars: + final_input_vars.append(input_fp16_vars) + if input_fp32_vars: + final_input_vars.append(input_fp32_vars) + + return final_output_vars, final_input_vars diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index ca3606c16e5d47..85f114d7f71413 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -368,7 +368,8 @@ def insert_reduce_ops(block, for var in reduce_vars: root_id = get_grad_device(var, shard) - assert root_id >= 0, "root id should be a positive int".format(var) + assert root_id >= 0, "root id should be a positive int, but now root id is {}".format( + root_id) block._insert_op_without_sync( insert_idx, type='c_reduce_sum', @@ -638,3 +639,8 @@ def append_naive_sync(block, sync_var, ring_id): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) + block.append_op( + type='c_sync_calc_stream', + inputs={'X': [sync_var]}, + outputs={'Out': [sync_var]}, + attrs={OP_ROLE_KEY: OpRole.Forward}) diff --git a/python/paddle/distributed/fleet/meta_parallel/__init__.py b/python/paddle/distributed/fleet/meta_parallel/__init__.py index 894771a3d5005f..0750c2c250e2bb 
--- a/python/paddle/distributed/fleet/meta_parallel/__init__.py
+++ b/python/paddle/distributed/fleet/meta_parallel/__init__.py
@@ -15,6 +15,7 @@
 from .parallel_layers import VocabParallelEmbedding  # noqa: F401
 from .parallel_layers import ColumnParallelLinear  # noqa: F401
 from .parallel_layers import RowParallelLinear  # noqa: F401
+from .parallel_layers import ParallelCrossEntropy  # noqa: F401
 from .parallel_layers import LayerDesc  # noqa: F401
 from .parallel_layers import PipelineLayer  # noqa: F401
 from .parallel_layers import RNGStatesTracker  # noqa: F401
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py
index 6a33611403ace0..72da962b8914eb 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py
@@ -15,6 +15,7 @@
 from .mp_layers import VocabParallelEmbedding  # noqa: F401
 from .mp_layers import ColumnParallelLinear  # noqa: F401
 from .mp_layers import RowParallelLinear  # noqa: F401
+from .mp_layers import ParallelCrossEntropy  # noqa: F401
 from .pp_layers import LayerDesc  # noqa: F401
 from .pp_layers import PipelineLayer  # noqa: F401
 from .random import RNGStatesTracker  # noqa: F401
diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
index 730a7430133e06..f091c890f68542 100644
--- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
+++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py
@@ -18,6 +18,7 @@
 from paddle.nn import functional as F
 from paddle import framework
 from ...base import topology as tp
+from paddle.autograd import PyLayer

 __all__ = []

@@ -43,14 +44,13 @@ def __init__(self,
         self.origin_num_embeddings = num_embeddings
         self.is_mp = (self.world_size > 1)

-        per_part_size = (
-            num_embeddings + self.world_size - 1) // self.world_size
-        last_part_size = num_embeddings - per_part_size * (self.world_size - 1)
-        if self.rank == self.world_size - 1:
-            per_part_size = last_part_size
-        per_part_size += 1  # make the last row as the padding index
-        self.per_part_size = per_part_size
+        assert num_embeddings % self.world_size == 0, (
+            "The length of the vocabulary must be divisible by the parallelism degree of MP"
+        )

+        per_part_size = num_embeddings // self.world_size
+
+        self.vocab_start_index = self.rank * per_part_size
         self._dtype = self._helper.get_default_dtype()
         self._size = [per_part_size, embedding_dim]
         self._weight_attr = weight_attr
@@ -63,49 +63,35 @@ def __init__(self,
                 shape=self._size,
                 dtype=self._dtype,
                 is_bias=False)
-            self.weight[per_part_size - 1] = 0.0
-            self.weight.is_distributed = True
         else:
             self.weight = self.create_parameter(
                 attr=self._weight_attr,
-                shape=[num_embeddings, embedding_dim],
+                shape=self._size,
                 dtype=self._dtype,
                 is_bias=False)

+        self.weight.is_distributed = True
+
     def forward(self, x):
-        if not self.is_mp:
-            return F.embedding(
+        if self.is_mp:
+            output_parallel = paddle.distributed.collective._c_lookup_table(
+                self.weight,
+                x,
+                start_index=self.vocab_start_index,
+                name=self._name)
+            output = paddle.distributed.collective._mp_allreduce(
+                output_parallel,
+                group=self.model_parallel_group,
+                use_calc_stream=True,
+                use_model_parallel=True)
+        else:
+            output = F.embedding(
                 x,
                 weight=self.weight,
                 padding_idx=None,
                 sparse=False,
                 name=self._name)
-
-        origin_input_shape = x.shape
-        if len(origin_input_shape) == 2:
-            x = paddle.unsqueeze(x, axis=-1)
-        else:
-            assert origin_input_shape[-1] == 1, (
-                "The last dimension size of x must be 1.")
-        x_shard = paddle.shard_index(x, self.origin_num_embeddings,
-                                     self.world_size, self.rank,
-                                     self.per_part_size - 1)
-        if len(origin_input_shape) == 2:
-            x_shard = paddle.squeeze(x_shard, axis=-1)
-
-        emb_out = F.embedding(
-            x_shard,
-            weight=self.weight,
-            padding_idx=self.per_part_size - 1,
-            sparse=False,
-            name=self._name)
-
-        emb_out = paddle.distributed.collective._mp_allreduce(
-            emb_out,
-            group=self.model_parallel_group,
-            use_calc_stream=True,
-            use_model_parallel=True)
-        return emb_out
+        return output


 class ColumnParallelLinear(Layer):
@@ -175,9 +161,7 @@ def forward(self, x):

         if self.gather_output and self.is_mp:
             output = paddle.distributed.collective._c_concat(
-                output_parallel,
-                nranks=self.world_size,
-                group=self.model_parallel_group)
+                output_parallel, group=self.model_parallel_group)
         else:
             output = output_parallel
         return output
@@ -245,10 +229,7 @@ def forward(self, x):
         else:
             # split last dim
             input_parallel = paddle.distributed.collective._c_split(
-                x,
-                rank=self.rank,
-                nranks=self.world_size,
-                group=self.model_parallel_group)
+                x, group=self.model_parallel_group)

         output_parallel = F.linear(input_parallel, self.weight, name=self._name)
@@ -263,3 +244,19 @@ def forward(self, x):
         output = output_ + self.bias if self.bias is not None else output_
         return output
+
+
+class ParallelCrossEntropy(Layer):
+    def __init__(self, name=None):
+        super(ParallelCrossEntropy, self).__init__()
+        self.name = name
+        self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group(
+        )
+        self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size(
+        )
+        self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank()
+
+    def forward(self, input, label):
+        loss = paddle.distributed.collective._c_softmax_with_cross_entropy(
+            input, label, group=self.model_parallel_group)
+        return loss
diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py
index 087942e70a2263..f9cedba7773fbf 100644
--- a/python/paddle/distributed/fleet/utils/fs.py
+++ b/python/paddle/distributed/fleet/utils/fs.py
@@ -841,8 +841,7 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
                     fs_src_path))

             if self.is_exist(fs_dst_path):
-                raise FSFileExistsError("{} exists already".format(
-                    fs_src_path, fs_dst_path, fs_dst_path))
+                raise FSFileExistsError("{} exists already".format(fs_dst_path))

             return self._try_mv(fs_src_path, fs_dst_path)
diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py
old mode 100644
new mode 100755
index e58c8aa1625dde..78503baf2fd5d2
--- a/python/paddle/distributed/fleet/utils/recompute.py
+++ b/python/paddle/distributed/fleet/utils/recompute.py
@@ -97,10 +97,12 @@ def forward(ctx, run_function, preserve_rng_state, *args):
             ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state()

         # TODO support AMP
+        tracer = framework._dygraph_tracer()
+        ctx.is_fw_autocast = tracer._enable_autocast
+        ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()

         with paddle.no_grad():
             outputs = run_function(*args)
-
         return outputs

     @staticmethod
@@ -119,15 +121,23 @@ def backward(ctx, *args):
             tracer = framework._dygraph_tracer()
             tracer._has_grad = True

-            # TODO support AMP
-
+            # NOTE: support AMP
+            # The auto_cast state and the white/black op lists must be restored as well.
             if ctx.preserve_rng_state:
                 with swith_rng_state(ctx.fw_cuda_rng_state):
+                    with paddle.amp.auto_cast(
+                            enable=ctx.is_fw_autocast,
+                            custom_white_list=ctx.amp_white_list,
+                            custom_black_list=ctx.amp_black_list):
+                        detached_inputs = detach_variable(tuple(inputs))
+                        outputs = ctx.run_function(*detached_inputs)
+            else:
+                with paddle.amp.auto_cast(
+                        enable=ctx.is_fw_autocast,
+                        custom_white_list=ctx.amp_white_list,
+                        custom_black_list=ctx.amp_black_list):
                     detached_inputs = detach_variable(tuple(inputs))
                     outputs = ctx.run_function(*detached_inputs)
-            else:
-                detached_inputs = detach_variable(tuple(inputs))
-                outputs = ctx.run_function(*detached_inputs)

         if isinstance(outputs, core.VarBase):
             outputs = (outputs, )
@@ -155,7 +165,6 @@ def backward(ctx, *args):
             grads = list(inp._grad_ivar() for inp in detached_inputs
                          if isinstance(inp, core.VarBase))
-
             return grads
diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py
index e84025c2eb6d20..447c059537ba3f 100644
--- a/python/paddle/distributed/utils.py
+++ b/python/paddle/distributed/utils.py
@@ -25,6 +25,7 @@
 from contextlib import closing
 import socket
 from paddle.fluid import core
+from distutils.util import strtobool

 __all__ = [ #noqa
     'get_host_name_ip',
@@ -166,7 +167,7 @@ def __eq__(self, cluster):
     def __ne__(self, cluster):
         return not self.__eq__(cluster)

-    def update_pods(cluster):
+    def update_pods(self, cluster):
         self.pods = copy.copy(cluster.pods)

     def trainers_nranks(self):
@@ -264,7 +265,7 @@ def __eq__(self, pod):
                 self.id != pod.id or \
                 self.addr != pod.addr or \
                 self.port != pod.port:
-            logger.debug("pod {} != pod".format(self, pod))
+            logger.debug("pod {} != {}".format(self, pod))
             return False

         if len(self.trainers) != len(pod.trainers):
@@ -384,7 +385,7 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
         add_argument("name", str, "Jonh", "User name.", parser)
         args = parser.parse_args()
     """
-    type = distutils.util.strtobool if type == bool else type
+    type = strtobool if type == bool else type
     argparser.add_argument(
         "--" + argname,
         default=default,
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 25412a86a8b940..708167a0273996 100755
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -456,7 +456,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx):
     In these cases, the variable should be the accumulation of all the outputs.
     `sum_op`s are added to implement the accumulate.
""" - _MAX_ADD_NUM_ = core.globals()['FLAGS_max_inplace_grad_add'] + _MAX_ADD_NUM_ = framework._global_flags()['FLAGS_max_inplace_grad_add'] #pending_sum_ops = [] pending_sum_ops = collections.OrderedDict() var_rename_count = collections.defaultdict(int) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index 3cb9fe75559b16..d5d2e7a0d96396 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -303,14 +303,23 @@ def apply_gradients(self, params_grads): if self._is_distributed: # if distributed, split check_finite_and_unscale to overlap # unscale with communication - for p, g in params_grads: - with self._train_program._optimized_guard([p, g]): + if core.is_compiled_with_npu(): + with self._train_program._optimized_guard(grads): _, found_inf = check_finite_and_unscale( - [g, ], + grads, self._loss_scaling, name="find_infinite_scale", float_status=self._float_status) found_infs.append(found_inf) + else: + for p, g in params_grads: + with self._train_program._optimized_guard([p, g]): + _, found_inf = check_finite_and_unscale( + [g, ], + self._loss_scaling, + name="find_infinite_scale", + float_status=self._float_status) + found_infs.append(found_inf) elif self._use_pure_fp16: if fp32_grads: with self._train_program._optimized_guard(fp32_grads): diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py index ca4bfac5ba5a14..11ab8800f287f4 100644 --- a/python/paddle/fluid/contrib/model_stat.py +++ b/python/paddle/fluid/contrib/model_stat.py @@ -150,6 +150,7 @@ def _format_summary(collected_ops_list): ''' _verify_dependent_package() + from prettytable import PrettyTable summary_table = PrettyTable( ["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"]) summary_table.align = 'r' diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 66b11d1f17ad41..600ce6397e1af3 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -251,24 +251,25 @@ def __init__(self, super(ImperativeQuantizeInputs, self).__init__() self._quantizable_layer_type = tuple( - utils.quant_input_layers_map[layer] - if layer in utils.quant_input_layers_map else layer + utils.layer_name_map[layer] + if layer in utils.layer_name_map else layer for layer in quantizable_layer_type) for layer in self._quantizable_layer_type: - assert not isinstance(layer, str), \ + assert not isinstance(layer, str) \ + and layer in utils.fake_quant_input_layers, \ "%s is unspported to be quantized." % layer quantize_type = { 'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max' } - assert weight_quantize_type in quantize_type, \ + assert weight_quantize_type != 'moving_average_abs_max' \ + and weight_quantize_type in quantize_type, \ "Unsupported weight_quantize_type: %s. It can only " \ - "be abs_max or moving_average_abs_max or " \ - "channel_wise_abs_max." % weight_quantize_type - assert activation_quantize_type != 'channel_wise_abs_max' \ - and activation_quantize_type in quantize_type, \ + "be abs_max or channel_wise_abs_max." % weight_quantize_type + # TODO (jc): activation_quantize_type supports range_abs_max + assert activation_quantize_type == 'moving_average_abs_max', \ "Unsupported activation_quantize_type: %s. 
It can " \ - "only be abs_max or moving_average_abs_max now." \ + "only be moving_average_abs_max now." \ % activation_quantize_type bits_check = lambda bits: isinstance(bits, int) \ @@ -305,30 +306,22 @@ def apply(self, model): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - for name, layer in model.named_sublayers(): - if not isinstance(layer, self._quantizable_layer_type) \ - or (hasattr(layer, "skip_quant") \ - and layer.skip_quant == True): + for name, cur_layer in model.named_sublayers(): + if not isinstance(cur_layer, self._quantizable_layer_type) \ + or (hasattr(cur_layer, "skip_quant") \ + and cur_layer.skip_quant == True): continue - # TODO(jc): optimize this module - last_idx = 0 - idx = 0 - obj = model - while idx < len(name): - if (name[idx] == '.'): - if hasattr(obj, name[last_idx:idx]): - obj = getattr(obj, name[last_idx:idx]) - last_idx = idx + 1 - idx += 1 - target = name[last_idx:idx] - - quant_layer = self._get_input_quantized_layer(layer) - setattr(obj, target, quant_layer) + parent_layer, sub_name = \ + utils.find_parent_layer_and_sub_name(model, name) + + cur_quant_layer = self._get_input_quantized_layer(cur_layer) + setattr(parent_layer, sub_name, cur_quant_layer) def _get_input_quantized_layer(self, layer): quant_layer_name = None - for key, value in utils.quant_input_layers_map.items(): + + for key, value in utils.layer_name_map.items(): if isinstance(layer, value): quant_layer_name = 'Quantized' + key break @@ -336,10 +329,6 @@ def _get_input_quantized_layer(self, layer): "The layer %s is unsupported to be quantized." \ % layer.full_name() - layer_with_weight = ['QuantizedConv2D', 'QuantizedLinear'] - if quant_layer_name not in layer_with_weight: - quant_layer_name = 'QuantizedNoweightLayer' - return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs) @@ -374,25 +363,21 @@ def apply(self, model): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - for name, layer in model.named_sublayers(): - if not self._is_target_layer(layer): + for cur_name, cur_layer in model.named_sublayers(): + if not self._is_target_layer(cur_layer): continue - # TODO(jc): optimize this module - last_idx = 0 - idx = 0 - obj = model - while idx < len(name): - if (name[idx] == '.'): - if hasattr(obj, name[last_idx:idx]): - obj = getattr(obj, name[last_idx:idx]) - last_idx = idx + 1 - idx += 1 - target = name[last_idx:idx] - - quant_layer = quant_nn.__dict__["QuantizedOutputLayer"]( - layer, self._moving_rate) - setattr(obj, target, quant_layer) + parent_layer, sub_name = \ + utils.find_parent_layer_and_sub_name(model, cur_name) + + if isinstance(cur_layer, tuple(utils.fake_quant_output_layers)): + cur_quant_layer = quant_nn.FakeQuantMAOutputScaleLayer( + cur_layer, self._moving_rate) + else: + cur_quant_layer = quant_nn.MAOutputScaleLayer(cur_layer, + self._moving_rate) + + setattr(parent_layer, sub_name, cur_quant_layer) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -468,9 +453,18 @@ def _is_target_layer(self, layer): """ Whether the layer needs to calculate output scales. 
""" - return isinstance(layer, utils.quant_output_layers) \ - or ('quantized' in layer.full_name() and \ - 'quantized_noweight' not in layer.full_name()) + flag = False + if isinstance(layer, dygraph.Layer): + # exclude fake_quant ops in quant_nn file + if utils.is_leaf_layer(layer) and \ + not isinstance(layer, tuple(utils.fake_quant_leaf_layers)): + flag = True + # consider QuantizedConv2D and QuantizedLinear ops + if isinstance(layer, tuple(utils.fake_quant_wrap_layers)): + flag = True + if isinstance(layer, paddle.nn.quant.FloatFunctionalLayer): + flag = True + return flag def _save_output_scale(self, program, scope): """ @@ -514,4 +508,4 @@ def _is_skip_quant_op(self, block, in_op): previous_ops = [utils.find_previous_op(block, arg_name) \ for arg_name in in_op.input_arg_names] return any(op is not None and op.type not in \ - utils.fake_quantize_dequantize_types for op in previous_ops) + utils.fake_quantize_dequantize_op_types for op in previous_ops) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index f6fef0689d43af..fd1f7f423ff8f4 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -22,17 +22,28 @@ from paddle.fluid.initializer import Constant from paddle.fluid.data_feeder import check_variable_and_dtype from paddle.nn import functional as F +import logging +from paddle.fluid.log_helper import get_logger __all__ = [ - 'FakeQuantMovingAverage', 'FakeQuantAbsMax', - 'FakeChannelWiseQuantDequantAbsMax', 'QuantizedConv2D', 'QuantizedLinear', - 'QuantizedNoweightLayer', 'MovingAverageAbsMaxScale' + 'FakeQuantMovingAverageAbsMax', + 'FakeQuantAbsMax', + 'FakeQuantChannelWiseAbsMax', + 'QuantizedConv2D', + 'QuantizedLinear', + 'QuantizedNoweightLayer', + 'MovingAverageAbsMaxScale', + 'MAOutputScaleLayer', + 'FakeQuantMAOutputScaleLayer', ] +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -class FakeQuantMovingAverage(layers.Layer): + +class FakeQuantMovingAverageAbsMax(layers.Layer): r""" - FakeQuantMovingAverage layer does the moving_average_abs_max quant and then dequant. + FakeQuantMovingAverageAbsMax layer does the moving_average_abs_max quant and then dequant. Its computational formula is described as below: :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)` @@ -45,7 +56,7 @@ def __init__(self, moving_rate=0.9, quant_bits=8, dtype='float32'): - super(FakeQuantMovingAverage, self).__init__() + super(FakeQuantMovingAverageAbsMax, self).__init__() self._moving_rate = moving_rate self._quant_bits = quant_bits @@ -98,7 +109,7 @@ def forward(self, input): return out check_variable_and_dtype(input, 'input', ['float32'], - "FakeQuantMovingAverage") + "FakeQuantMovingAverageAbsMax") attrs = { 'moving_rate': self._moving_rate, 'bit_length': self._quant_bits, @@ -210,7 +221,7 @@ def forward(self, input): return quant_out -class FakeChannelWiseQuantDequantAbsMax(layers.Layer): +class FakeQuantChannelWiseAbsMax(layers.Layer): def __init__(self, name=None, channel_num=None, @@ -219,7 +230,7 @@ def __init__(self, dtype='float32', quant_on_weight=False): assert quant_on_weight == True, "Channel_wise only can be used on weight quantization." 
-        super(FakeChannelWiseQuantDequantAbsMax, self).__init__()
+        super(FakeQuantChannelWiseAbsMax, self).__init__()
         self._quant_bits = quant_bits
         self._quant_axis = quant_axis
         self._dtype = dtype
@@ -265,7 +276,7 @@ def forward(self, input):
             return out

         check_variable_and_dtype(input, 'input', ['float32'],
-                                 "FakeChannelWiseQuantDequantAbsMax")
+                                 "FakeQuantChannelWiseAbsMax")
         attrs = {'bit_length': self._quant_bits, 'quant_axis': self._quant_axis}
         inputs = {"X": [input]}
         quant_out = self._helper.create_variable(
@@ -313,8 +324,8 @@ def _get_fake_quant_type(quant_type, **kwargs):
             "when you use channel_wise_abs_max strategy.")
     fake_quant_map = {
         'abs_max': FakeQuantAbsMax,
-        'moving_average_abs_max': FakeQuantMovingAverage,
-        'channel_wise_abs_max': FakeChannelWiseQuantDequantAbsMax
+        'moving_average_abs_max': FakeQuantMovingAverageAbsMax,
+        'channel_wise_abs_max': FakeQuantChannelWiseAbsMax
     }

     return fake_quant_map[quant_type](**call_args)
@@ -498,12 +509,7 @@ def __init__(self,
             quant_on_weight=False)

     def forward(self, input):
-        quant_input = self._fake_quant_input(input)
-        # TODO (jc): support ops that have several inputs
-        if isinstance(input, list):
-            assert len(input) == 1, \
-                "The QuantizedNoweightLayer should only have one input."
-        return self._layer.forward(quant_input)
+        return self._layer.forward(self._fake_quant_input(input))


 class MovingAverageAbsMaxScale(layers.Layer):
@@ -590,19 +596,56 @@ def forward(self, input):
         return quant_out


-class QuantizedOutputLayer(layers.Layer):
-    def __init__(self, layer=None, moving_rate=0.9, dtype='float32'):
+class MAOutputScaleLayer(layers.Layer):
+    """
+    Calculate the scale (moving average abs max) for the output of the input layer.
+    Append a MovingAverageAbsMaxScale layer after the input layer.
+    """
+
+    def __init__(self, layer=None, moving_rate=0.9, name=None, dtype='float32'):
         r"""
-        Add MovingAverageMaxScale layer to the behind of the input layer.
+        Construct a MAOutputScaleLayer that wraps the input layer.
         """
-        super(QuantizedOutputLayer, self).__init__()
+        super(MAOutputScaleLayer, self).__init__()
         self._layer = layer
-        self._moving_average_abs_max_scale = \
-            MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype)
+        if name is None:
+            name = layer.full_name()
+        self._ma_output_scale = \
+            MovingAverageAbsMaxScale(name, moving_rate, dtype)
+
+    def forward(self, *inputs, **kwargs):
+        out = self._layer(*inputs, **kwargs)
+        # TODO (jc): support the ops of several outputs
+        if (isinstance(out, list) or isinstance(out, tuple)) and len(out) > 1:
+            return out
+        else:
+            return self._ma_output_scale(out)

-    def forward(self, input):
-        if isinstance(input, list):
-            assert len(input) == 1, \
-                "The QuantizedOutputLayer should only have one input."
-        out = self._layer(input)
-        return self._moving_average_abs_max_scale(out)
+
+class FakeQuantMAOutputScaleLayer(layers.Layer):
+    def __init__(self,
+                 layer,
+                 weight_bits=8,
+                 activation_bits=8,
+                 moving_rate=0.9,
+                 name=None,
+                 *args,
+                 **kwargs):
+
+        super(FakeQuantMAOutputScaleLayer, self).__init__()
+        self._layer = layer
+        self._fake_quant_output = _get_fake_quant_type(
+            'moving_average_abs_max',
+            name=layer.full_name() if name is None else name,
+            moving_rate=moving_rate,
+            quant_bits=activation_bits,
+            dtype=self._dtype,
+            quant_on_weight=False)
+
+    def forward(self, *inputs, **kwargs):
+        out = self._layer(*inputs, **kwargs)
+        # TODO (jc): support the ops of several outputs
+        if (isinstance(out, list) or isinstance(out, tuple)) and len(out) > 1:
+            return out
+        else:
+            return self._fake_quant_output(out)
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
index 491f8a7e25cbcd..94639b9cc68f94 100644
--- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
+++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py
@@ -13,9 +13,11 @@
 # limitations under the License.

 import paddle
+from paddle.fluid import dygraph
 import numpy as np
+from . import quant_nn

-quant_input_layers_map = {
+layer_name_map = {
     'Conv2D': paddle.nn.Conv2D,
     'Linear': paddle.nn.Linear,
     'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D,
@@ -37,30 +39,38 @@
     'LayerNorm': paddle.nn.LayerNorm,
 }

-fake_quantize_dequantize_types = [
-    "fake_quantize_dequantize_abs_max",
-    "fake_channel_wise_quantize_dequantize_abs_max",
-    "fake_quantize_dequantize_moving_average_abs_max"
+# Apply fake quant for the inputs of these layers
+# TODO (jc): support paddle.nn.Conv2DTranspose
+fake_quant_input_layers = [paddle.nn.Conv2D, paddle.nn.Linear]
+
+# Apply fake quant for the output of these layers
+# TODO(jc): fix the problem of adding duplicate fake_quant ops
+# paddle.nn.AdaptiveAvgPool2D, paddle.nn.AvgPool2D, paddle.nn.ReLU,paddle.nn.LeakyReLU
+fake_quant_output_layers = [
+    paddle.nn.quant.add, paddle.nn.quant.subtract, paddle.nn.quant.multiply,
+    paddle.nn.quant.divide
+]
+
+fake_quant_leaf_layers = [
+    quant_nn.FakeQuantAbsMax,
+    quant_nn.FakeQuantChannelWiseAbsMax,
+    quant_nn.FakeQuantMovingAverageAbsMax,
+    quant_nn.MovingAverageAbsMaxScale,
 ]

-quant_output_layers = (
-    paddle.nn.Conv2D, paddle.nn.Conv2DTranspose, paddle.nn.Linear,
-    paddle.nn.AdaptiveAvgPool2D, paddle.nn.AdaptiveMaxPool2D,
-    paddle.nn.AvgPool2D, paddle.nn.MaxPool2D, paddle.nn.BatchNorm,
-    paddle.nn.BatchNorm2D, paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm,
-    paddle.nn.ELU, paddle.nn.GELU, paddle.nn.Hardshrink, paddle.nn.Hardsigmoid,
-    paddle.nn.Hardswish, paddle.nn.Hardtanh, paddle.nn.LeakyReLU,
-    paddle.nn.LogSigmoid, paddle.nn.LogSoftmax, paddle.nn.Maxout,
-    paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, paddle.nn.SELU,
-    paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Softplus,
-    paddle.nn.Softshrink, paddle.nn.Softsign, paddle.nn.Swish, paddle.nn.Tanh,
-    paddle.nn.Tanhshrink, paddle.nn.ThresholdedReLU, paddle.nn.Upsample)
+fake_quant_wrap_layers = [quant_nn.QuantizedConv2D, quant_nn.QuantizedLinear]

 weight_op_types = [
     "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose",
     "depthwise_conv2d_transpose"
 ]

+fake_quantize_dequantize_op_types = [
+    "fake_quantize_dequantize_abs_max",
+    "fake_channel_wise_quantize_dequantize_abs_max",
+    "fake_quantize_dequantize_moving_average_abs_max"
+]
+

 def load_variable_data(scope, var_name):
     '''
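The wrapper classes above and the helper tables in this hunk all serve one rewrite pattern: walk `named_sublayers()`, resolve each dotted sublayer name to its parent object, and rebind that attribute to a layer that post-processes the output. A minimal, framework-agnostic sketch of the pattern follows; the `Wrapper` and `Net` classes are illustrative stand-ins, not Paddle APIs:

```python
# Minimal sketch of the "find parent, rebind attribute" pattern used by the
# QAT passes above. Wrapper and Net are toy stand-ins, not Paddle APIs.
class Wrapper:
    def __init__(self, inner):
        self.inner = inner

    def __call__(self, *args, **kwargs):
        out = self.inner(*args, **kwargs)
        # Post-process the single output here, e.g. record a moving-average
        # abs-max scale the way MAOutputScaleLayer does.
        return out


def find_parent_and_sub_name(model, name):
    # 'features.conv_1' -> (model.features, 'conv_1')
    parts = name.split('.')
    parent = model
    for part in parts[:-1]:
        parent = getattr(parent, part)
    return parent, parts[-1]


class Net:
    def __init__(self):
        self.fc = lambda x: x * 2.0


net = Net()
parent, sub_name = find_parent_and_sub_name(net, 'fc')
setattr(parent, sub_name, Wrapper(getattr(parent, sub_name)))
assert net.fc(3.0) == 6.0  # the wrapper now intercepts every call
```

The real helper, `find_parent_layer_and_sub_name` in the next hunk, walks the name character by character and guards each step with `hasattr`, but the effect is the same: the quantized wrapper replaces the original sublayer in place on its parent.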
@@ -90,3 +100,36 @@ def find_next_ops(block, var_name):
         if var_name in op.input_arg_names:
             res_ops.append(op)
     return res_ops
+
+
+def find_parent_layer_and_sub_name(model, name):
+    """
+    Given the model and the name of a layer, find the parent layer and
+    the sub_name of the layer.
+    For example, if name is 'block_1.convbn_1.conv_1', the parent layer is
+    'block_1.convbn_1' and the sub_name is `conv_1`.
+    """
+    assert isinstance(model, dygraph.Layer), \
+        "The model must be an instance of paddle.nn.Layer."
+    assert len(name) > 0, "The input (name) should not be empty."
+
+    last_idx = 0
+    idx = 0
+    parent_layer = model
+    while idx < len(name):
+        if name[idx] == '.':
+            sub_name = name[last_idx:idx]
+            if hasattr(parent_layer, sub_name):
+                parent_layer = getattr(parent_layer, sub_name)
+                last_idx = idx + 1
+        idx += 1
+    sub_name = name[last_idx:idx]
+    return parent_layer, sub_name
+
+
+def is_leaf_layer(layer):
+    """
+    Whether the layer is a leaf layer, i.e. it has no sublayers.
+    """
+    return isinstance(layer, dygraph.Layer) \
+        and len(layer.sublayers()) == 0
diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
index 758e01b8245a2d..20c60dc58b78dc 100644
--- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt
@@ -25,21 +25,21 @@ function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_pa
     _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} True)
 endfunction()

-function(download_quant_data install_dir data_file)
+function(download_quant_data install_dir data_file check_sum)
     if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum})
     endif()
 endfunction()

-function(download_quant_model install_dir data_file)
+function(download_quant_model install_dir data_file check_sum)
     if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
     endif()
 endfunction()

-function(download_quant_fp32_model install_dir data_file)
+function(download_quant_fp32_model install_dir data_file check_sum)
     if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file} ${check_sum})
     endif()
 endfunction()

@@ -86,15 +86,15 @@ function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da
             --ops_to_quantize ${ops_to_quantize})
 endfunction()

-function(download_quant_data install_dir data_file)
+function(download_quant_data install_dir data_file check_sum)
     if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum})
     endif()
 endfunction()

-function(download_quant_model install_dir data_file)
+function(download_quant_model install_dir data_file check_sum)
     if (NOT EXISTS ${install_dir}/${data_file})
-        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum})
     endif()
 endfunction()

@@ -149,43 +149,43 @@ if(LINUX AND WITH_MKLDNN)
     # Quant ResNet50
     set(QUANT_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant")
     set(QUANT_RESNET50_MODEL_ARCHIVE "ResNet50_qat_model.tar.gz")
-    download_quant_model(${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE} ff89b934ab961c3a4a844193ece2e8a7)
     inference_quant_int8_image_classification_test(test_quant_int8_resnet50_mkldnn ${QUANT_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant ResNet101
     set(QUANT_RESNET101_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet101_quant")
     set(QUANT_RESNET101_MODEL_ARCHIVE "ResNet101_qat_model.tar.gz")
-    download_quant_model(${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE} 95c6d01e3aeba31c13efb2ba8057d558)
     # inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant GoogleNet
     set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant")
     set(QUANT_GOOGLENET_MODEL_ARCHIVE "GoogleNet_qat_model.tar.gz")
-    download_quant_model(${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE} 1d4a7383baa63e7d1c423e8db2b791d5)
     inference_quant_int8_image_classification_test(test_quant_int8_googlenet_mkldnn ${QUANT_GOOGLENET_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant MobileNetV1
     set(QUANT_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant")
     set(QUANT_MOBILENETV1_MODEL_ARCHIVE "MobileNetV1_qat_model.tar.gz")
-    download_quant_model(${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE} 3b774d94a9fcbb604d09bdb731fc1162)
     inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv1_mkldnn ${QUANT_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant MobileNetV2
     set(QUANT_MOBILENETV2_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV2_quant")
     set(QUANT_MOBILENETV2_MODEL_ARCHIVE "MobileNetV2_qat_model.tar.gz")
-    download_quant_model(${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE} 758a99d9225d8b73e1a8765883f96cdd)
     inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv2_mkldnn ${QUANT_MOBILENETV2_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant VGG16
     set(QUANT_VGG16_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG16_quant")
     set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz")
-    download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE} c37e63ca82a102f47be266f8068b0b55)
     # inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant VGG19
     set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant")
     set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz")
-    download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE})
+    download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE} 62bcd4b6c3ca2af67e8251d1c96ea18f)
     # inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     ### Quant2 for image classification

@@ -194,7 +194,7 @@ if(LINUX AND WITH_MKLDNN)
     # with weight scales in `fake_dequantize_max_abs` operators
     set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2")
     set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz")
-    download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE})
+    download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE} e87309457e8c462a579340607f064d66)
     set(FP32_RESNET50_MODEL_DIR "${INT8_INSTALL_DIR}/resnet50")
     inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_mkldnn ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

@@ -202,20 +202,20 @@ if(LINUX AND WITH_MKLDNN)
     # with weight scales in `fake_dequantize_max_abs` operators
     set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range")
     set(QUANT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz")
-    download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE})
+    download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE} 2fdc8a139f041c0d270abec826b2d304)
     inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_range_mkldnn ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes,
     # with weight scales in `fake_channel_wise_dequantize_max_abs` operators
     set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise")
     set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz")
-    download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE})
+    download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE} 887a1b1b0e9a4efd10f263a43764db26)
     inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_channelwise_mkldnn ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

     # Quant2 MobileNetV1
     set(QUANT2_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant2")
     set(QUANT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz")
-    download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE})
+    download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE} 7f626e453db2d56fed6c2538621ffacf)
     set(FP32_MOBILENETV1_MODEL_DIR "${INT8_INSTALL_DIR}/mobilenetv1")
     inference_quant2_int8_image_classification_test(test_quant2_int8_mobilenetv1_mkldnn ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH})

@@ -225,22 +225,22 @@ if(LINUX AND WITH_MKLDNN)
     set(NLP_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_dataset")
     set(NLP_DATA_PATH "${NLP_DATA_DIR}/Ernie_dataset/1.8w.bs1")
     set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev")
-    download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE})
+    download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE} e650ce0cbc1fadbed5cc2c01d4e734dc)

     # Quant2 Ernie
     set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz")
     set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2")
-    download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE})
+    download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE} f7cdf4720755ecf66efbc8044e9922d9)
     set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz")
     set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float")
-    download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE})
+    download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE} 114f38804a3ef8c45e7259e68bbd838b)
     set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add")
     inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE})

     # Quant2 GRU
     set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz")
     set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2")
-    download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE})
+    download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE} cf207f8076dcfb8b74d8b6bdddf9090c)
     set(QUANT2_GRU_OPS_TO_QUANTIZE "multi_gru")

     ### Save FP32 model or INT8 model from Quant model
@@ -270,12 +270,6 @@ list(REMOVE_ITEM TEST_OPS
 #TODO(wanghaoshuang): Fix this unitest failed on GCC8.
 LIST(REMOVE_ITEM TEST_OPS test_auto_pruning)
 LIST(REMOVE_ITEM TEST_OPS test_filter_pruning)
-
-# only tests on singal GPU environment
-LIST(REMOVE_ITEM TEST_OPS test_imperative_qat_addquantdequant)
-
-py_test_modules(test_imperative_qat_addquantdequant MODULES test_imperative_qat_addquantdequant ENVS
-    CUDA_VISIBLE_DEVICES=0)

 # fix
 if(WIN32)
@@ -313,7 +307,6 @@ set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120)
 set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120)
-set_tests_properties(test_imperative_qat_addquantdequant PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120)
 if(LINUX AND WITH_MKLDNN)
     set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py
new file mode 100644
index 00000000000000..cc26f6a88f2e0f
--- /dev/null
+++ b/python/paddle/fluid/contrib/slim/tests/imperative_test_utils.py
@@ -0,0 +1,224 @@
+# copyright (c) 2021 paddlepaddle authors. all rights reserved.
+#
+# licensed under the apache license, version 2.0 (the "license");
+# you may not use this file except in compliance with the license.
+# you may obtain a copy of the license at
+#
+#     http://www.apache.org/licenses/license-2.0
+#
+# unless required by applicable law or agreed to in writing, software
+# distributed under the license is distributed on an "as is" basis,
+# without warranties or conditions of any kind, either express or implied.
+# see the license for the specific language governing permissions and
+# limitations under the license.
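+
+# Shared helpers for the imperative QAT tests in this directory:
+#   - fix_model_dict(model) pins every parameter to a deterministic value
+#     (zeros for biases, fixed-scale normal noise for weights).
+#   - train_lenet(lenet, reader, optimizer) runs one pass over the reader and
+#     returns the loss sampled every 100 batches.
+#   - ImperativeLenet and ImperativeLenetWithSkipQuant are the LeNet variants
+#     the QAT tests quantize.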
+import numpy as np
+import logging
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.dygraph.container import Sequential
+from paddle.nn import ReLU, ReLU6, LeakyReLU, Sigmoid, Softmax, PReLU
+from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D
+
+from paddle.fluid.log_helper import get_logger
+
+_logger = get_logger(
+    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
+
+
+def fix_model_dict(model):
+    fixed_state = {}
+    for name, param in model.named_parameters():
+        p_shape = param.numpy().shape
+        p_value = param.numpy()
+        if name.endswith("bias"):
+            value = np.zeros_like(p_value).astype('float32')
+        else:
+            value = np.random.normal(
+                loc=0.0, scale=0.01,
+                size=np.product(p_shape)).reshape(p_shape).astype('float32')
+        fixed_state[name] = value
+    model.set_dict(fixed_state)
+    return model
+
+
+def train_lenet(lenet, reader, optimizer):
+    loss_list = []
+    lenet.train()
+
+    for batch_id, data in enumerate(reader()):
+        x_data = np.array([x[0].reshape(1, 28, 28)
+                           for x in data]).astype('float32')
+        y_data = np.array([x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+        img = paddle.to_tensor(x_data)
+        label = paddle.to_tensor(y_data)
+
+        out = lenet(img)
+        loss = fluid.layers.cross_entropy(out, label)
+        avg_loss = fluid.layers.mean(loss)
+        avg_loss.backward()
+
+        optimizer.minimize(avg_loss)
+        lenet.clear_gradients()
+
+        if batch_id % 100 == 0:
+            loss_list.append(avg_loss.numpy()[0])
+            _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
+
+    return loss_list
+
+
+class ImperativeLenet(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10):
+        super(ImperativeLenet, self).__init__()
+        conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+        conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+        fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+        fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+        fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+        conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+        fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+        fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+        fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+        self.features = Sequential(
+            Conv2D(
+                in_channels=1,
+                out_channels=6,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                weight_attr=conv2d_w1_attr,
+                bias_attr=False),
+            BatchNorm2D(6),
+            ReLU(),
+            MaxPool2D(
+                kernel_size=2, stride=2),
+            Conv2D(
+                in_channels=6,
+                out_channels=16,
+                kernel_size=5,
+                stride=1,
+                padding=0,
+                weight_attr=conv2d_w2_attr,
+                bias_attr=conv2d_b2_attr),
+            BatchNorm2D(16),
+            PReLU(),
+            MaxPool2D(
+                kernel_size=2, stride=2))
+
+        self.fc = Sequential(
+            Linear(
+                in_features=400,
+                out_features=120,
+                weight_attr=fc_w1_attr,
+                bias_attr=fc_b1_attr),
+            LeakyReLU(),
+            Linear(
+                in_features=120,
+                out_features=84,
+                weight_attr=fc_w2_attr,
+                bias_attr=fc_b2_attr),
+            Sigmoid(),
+            Linear(
+                in_features=84,
+                out_features=num_classes,
+                weight_attr=fc_w3_attr,
+                bias_attr=fc_b3_attr),
+            Softmax())
+        self.add = paddle.nn.quant.add()
+
+    def forward(self, inputs):
+        x = self.features(inputs)
+
+        x = fluid.layers.flatten(x, 1)
+        x = self.add(x, paddle.to_tensor(0.0))  # For CI
+        x = self.fc(x)
+        return x
+
+
+class ImperativeLenetWithSkipQuant(fluid.dygraph.Layer):
+    def __init__(self, num_classes=10):
+        super(ImperativeLenetWithSkipQuant, self).__init__()
+
+        conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
+        conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
+        fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
+        fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
+        fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
+        conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
+        conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
+        fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
+        fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
+        fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
+        self.conv2d_0 = Conv2D(
+            in_channels=1,
+            out_channels=6,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            weight_attr=conv2d_w1_attr,
+            bias_attr=conv2d_b1_attr)
+        self.conv2d_0.skip_quant = True
+
+        self.batch_norm_0 = BatchNorm2D(6)
+        self.relu_0 = ReLU()
+        self.pool2d_0 = MaxPool2D(kernel_size=2, stride=2)
+        self.conv2d_1 = Conv2D(
+            in_channels=6,
+            out_channels=16,
+            kernel_size=5,
+            stride=1,
+            padding=0,
+            weight_attr=conv2d_w2_attr,
+            bias_attr=conv2d_b2_attr)
+        self.conv2d_1.skip_quant = False
+
+        self.batch_norm_1 = BatchNorm2D(16)
+        self.relu6_0 = ReLU6()
+        self.pool2d_1 = MaxPool2D(kernel_size=2, stride=2)
+        self.linear_0 = Linear(
+            in_features=400,
+            out_features=120,
+            weight_attr=fc_w1_attr,
+            bias_attr=fc_b1_attr)
+        self.linear_0.skip_quant = True
+
+        self.leaky_relu_0 = LeakyReLU()
+        self.linear_1 = Linear(
+            in_features=120,
+            out_features=84,
+            weight_attr=fc_w2_attr,
+            bias_attr=fc_b2_attr)
+        self.linear_1.skip_quant = False
+
+        self.sigmoid_0 = Sigmoid()
+        self.linear_2 = Linear(
+            in_features=84,
+            out_features=num_classes,
+            weight_attr=fc_w3_attr,
+            bias_attr=fc_b3_attr)
+        self.linear_2.skip_quant = False
+        self.softmax_0 = Softmax()
+
+    def forward(self, inputs):
+        x = self.conv2d_0(inputs)
+        x = self.batch_norm_0(x)
+        x = self.relu_0(x)
+        x = self.pool2d_0(x)
+        x = self.conv2d_1(x)
+        x = self.batch_norm_1(x)
+        x = self.relu6_0(x)
+        x = self.pool2d_1(x)
+
+        x = fluid.layers.flatten(x, 1)
+
+        x = self.linear_0(x)
+        x = self.leaky_relu_0(x)
+        x = self.linear_1(x)
+        x = self.sigmoid_0(x)
+        x = self.linear_2(x)
+        x = self.softmax_0(x)
+
+        return x
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
index 8d6ce76ef0fa5f..6cc58a38f227a5 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py
@@ -28,7 +28,6 @@
 from paddle.fluid.optimizer import AdamOptimizer
 from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
-from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass, OutScaleForInferencePass, QuantizationTransformPass
 from paddle.fluid.dygraph.container import Sequential
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU
@@ -36,6 +35,8 @@
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph import nn

+from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenet
+
 paddle.enable_static()

 os.environ["CPU_NUM"] = "1"
@@ -54,59 +55,6 @@ def get_vaild_warning_num(warning, w):
     return num


-def StaticLenet(data, num_classes=10, classifier_activation='softmax'):
-    conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
-    conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
-    fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
-    fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
-    fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
-    conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
-    fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
-    fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
-    fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
-    conv1 = fluid.layers.conv2d(
-        data,
-        num_filters=6,
-        filter_size=3,
-        stride=1,
-        padding=1,
-        param_attr=conv2d_w1_attr,
-        bias_attr=False)
-    batch_norm1 = layers.batch_norm(conv1)
-    relu1 = layers.relu(batch_norm1)
-    pool1 = fluid.layers.pool2d(
-        relu1, pool_size=2, pool_type='max', pool_stride=2)
-    conv2 = fluid.layers.conv2d(
-        pool1,
-        num_filters=16,
-        filter_size=5,
-        stride=1,
-        padding=0,
-        param_attr=conv2d_w2_attr,
-        bias_attr=conv2d_b2_attr)
-    batch_norm2 = layers.batch_norm(conv2)
-    prelu1 = layers.prelu(batch_norm2, mode='all')
-    pool2 = fluid.layers.pool2d(
-        prelu1, pool_size=2, pool_type='max', pool_stride=2)
-
-    fc1 = fluid.layers.fc(input=pool2,
-                          size=120,
-                          param_attr=fc_w1_attr,
-                          bias_attr=fc_b1_attr)
-    leaky_relu1 = layers.leaky_relu(fc1, alpha=0.01)
-    fc2 = fluid.layers.fc(input=leaky_relu1,
-                          size=84,
-                          param_attr=fc_w2_attr,
-                          bias_attr=fc_b2_attr)
-    sigmoid1 = layers.sigmoid(fc2)
-    fc3 = fluid.layers.fc(input=sigmoid1,
-                          size=num_classes,
-                          param_attr=fc_w3_attr,
-                          bias_attr=fc_b3_attr)
-    softmax1 = layers.softmax(fc3, use_cudnn=True)
-    return softmax1
-
-
 class ImperativeLenet(fluid.dygraph.Layer):
     def __init__(self, num_classes=10):
         super(ImperativeLenet, self).__init__()
@@ -175,38 +123,11 @@ def forward(self, inputs):

 class TestImperativeOutSclae(unittest.TestCase):
     def test_out_scale_acc(self):
-        def _build_static_lenet(main, startup, is_test=False, seed=1000):
-            with fluid.unique_name.guard():
-                with fluid.program_guard(main, startup):
-                    main.random_seed = seed
-                    startup.random_seed = seed
-                    img = fluid.layers.data(
-                        name='image', shape=[1, 28, 28], dtype='float32')
-                    label = fluid.layers.data(
-                        name='label', shape=[1], dtype='int64')
-                    prediction = StaticLenet(img)
-                    if not is_test:
-                        loss = fluid.layers.cross_entropy(
-                            input=prediction, label=label)
-                        avg_loss = fluid.layers.mean(loss)
-                    else:
-                        avg_loss = prediction
-            return img, label, avg_loss
-
-        reader = paddle.batch(
-            paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
-        weight_quantize_type = 'abs_max'
-        activation_quantize_type = 'moving_average_abs_max'
-        param_init_map = {}
         seed = 1000
         lr = 0.001
-        dynamic_out_scale_list = []
-        static_out_scale_list = []

-        # imperative train
-        _logger.info(
-            "--------------------------dynamic graph qat--------------------------"
-        )
+        weight_quantize_type = 'abs_max'
+        activation_quantize_type = 'moving_average_abs_max'
         imperative_out_scale = ImperativeQuantAware(
             weight_quantize_type=weight_quantize_type,
             activation_quantize_type=activation_quantize_type)
@@ -215,207 +136,46 @@ def _build_static_lenet(main, startup, is_test=False, seed=1000):
             np.random.seed(seed)
             fluid.default_main_program().random_seed = seed
             fluid.default_startup_program().random_seed = seed
+
             lenet = ImperativeLenet()
-            fixed_state = {}
-            for name, param in lenet.named_parameters():
-                p_shape = param.numpy().shape
-                p_value = param.numpy()
-                if name.endswith("bias"):
-                    value = np.zeros_like(p_value).astype('float32')
-                else:
-                    value = np.random.normal(
-                        loc=0.0, scale=0.01, size=np.product(p_shape)).reshape(
-                            p_shape).astype('float32')
-                fixed_state[name] = value
-                param_init_map[param.name] = value
-            lenet.set_dict(fixed_state)
+            lenet = fix_model_dict(lenet)
             imperative_out_scale.quantize(lenet)
+
+            reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
             adam = AdamOptimizer(
                 learning_rate=lr, parameter_list=lenet.parameters())
-            dynamic_loss_rec = []
-            lenet.train()
-            for batch_id, data in enumerate(reader()):
-                x_data = np.array([x[0].reshape(1, 28, 28)
-                                   for x in data]).astype('float32')
-                y_data = np.array(
-                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
-
-                img = fluid.dygraph.to_variable(x_data)
-                label = fluid.dygraph.to_variable(y_data)
-
-                out = lenet(img)
-                loss = fluid.layers.cross_entropy(out, label)
-                avg_loss = fluid.layers.mean(loss)
-                avg_loss.backward()
-                adam.minimize(avg_loss)
-                lenet.clear_gradients()
-                dynamic_loss_rec.append(avg_loss.numpy()[0])
-                if batch_id % 100 == 0:
-                    _logger.info('{}: {}'.format('loss', avg_loss.numpy()))
-
+            loss_list = train_lenet(lenet, reader, adam)
             lenet.eval()
+
             param_save_path = "test_save_quantized_model/lenet.pdparams"
             save_dict = lenet.state_dict()
             paddle.save(save_dict, param_save_path)
-            path = "./dynamic_outscale_infer_model/lenet"
-            dynamic_save_dir = "./dynamic_outscale_infer_model"
-
+            save_path = "./dynamic_outscale_infer_model/lenet"
             imperative_out_scale.save_quantized_model(
                 layer=lenet,
-                path=path,
+                path=save_path,
                 input_spec=[
                     paddle.static.InputSpec(
                         shape=[None, 1, 28, 28], dtype='float32')
                 ])

-        _logger.info(
-            "--------------------------static graph qat--------------------------"
-        )
-        static_loss_rec = []
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-        exe = fluid.Executor(place)
-
-        main = fluid.Program()
-        infer = fluid.Program()
-        startup = fluid.Program()
-        static_img, static_label, static_loss = _build_static_lenet(
-            main, startup, False, seed)
-        infer_img, _, infer_pre = _build_static_lenet(infer, startup, True,
-                                                      seed)
-        with fluid.unique_name.guard():
-            with fluid.program_guard(main, startup):
-                opt = AdamOptimizer(learning_rate=lr)
-                opt.minimize(static_loss)
-
-        scope = core.Scope()
-        with fluid.scope_guard(scope):
-            exe.run(startup)
-        for param in main.all_parameters():
-            if "batch_norm" in param.name:
-                param_name = param.name.replace("norm", "norm2d")
-            elif 'prelu' in param.name:
-                param_name = param.name.replace("prelu", 'p_re_lu')
-            else:
-                param_name = param.name
-            param_tensor = scope.var(param.name).get_tensor()
-            param_tensor.set(param_init_map[param_name], place)
-        main_graph = IrGraph(core.Graph(main.desc), for_test=False)
-        infer_graph = IrGraph(core.Graph(infer.desc), for_test=True)
-        transform_pass = QuantizationTransformPass(
-            scope=scope,
-            place=place,
-            activation_quantize_type=activation_quantize_type,
-            weight_quantize_type=weight_quantize_type,
-            quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul'])
-        transform_pass.apply(main_graph)
-        transform_pass.apply(infer_graph)
-        outscale_pass = OutScaleForTrainingPass(scope=scope, place=place)
-        outscale_pass.apply(main_graph)
-        build_strategy = fluid.BuildStrategy()
-        build_strategy.fuse_all_reduce_ops = False
-        binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel(
-            loss_name=static_loss.name, build_strategy=build_strategy)
-
-        feeder = fluid.DataFeeder(
-            feed_list=[static_img, static_label], place=place)
-        with fluid.scope_guard(scope):
-            for batch_id, data in enumerate(reader()):
-                loss_v, = exe.run(binary,
-                                  feed=feeder.feed(data),
-                                  fetch_list=[static_loss])
-                static_loss_rec.append(loss_v[0])
-                if batch_id % 100 == 0:
-                    _logger.info('{}: {}'.format('loss', loss_v))
-        scale_inference_pass = OutScaleForInferencePass(scope=scope)
-        scale_inference_pass.apply(infer_graph)
-
-        save_program = infer_graph.to_program()
-        static_save_dir = "./static_outscale_infer_model"
-        with fluid.scope_guard(scope):
-            fluid.io.save_inference_model(
-                dirname=static_save_dir,
-                feeded_var_names=[infer_img.name],
-                target_vars=[infer_pre],
-                executor=exe,
-                main_program=save_program,
-                model_filename="lenet" + INFER_MODEL_SUFFIX,
-                params_filename="lenet" + INFER_PARAMS_SUFFIX)
-
-        rtol = 1e-05
-        atol = 1e-08
-        for i, (loss_d,
-                loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)):
-            diff = np.abs(loss_d - loss_s)
-            if diff > (atol + rtol * np.abs(loss_s)):
-                _logger.info(
-                    "diff({}) at {}, dynamic loss = {}, static loss = {}".
-                    format(diff, i, loss_d, loss_s))
-                break
-        self.assertTrue(
-            np.allclose(
-                np.array(dynamic_loss_rec),
-                np.array(static_loss_rec),
-                rtol=rtol,
-                atol=atol,
-                equal_nan=True),
-            msg='Failed to do the imperative qat.')
-
-        # load dynamic model
-        [dynamic_inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=dynamic_save_dir,
-                executor=exe,
-                model_filename="lenet" + INFER_MODEL_SUFFIX,
-                params_filename="lenet" + INFER_PARAMS_SUFFIX))
-        # load static model
-        [static_inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=static_save_dir,
-                executor=exe,
-                model_filename="lenet" + INFER_MODEL_SUFFIX,
-                params_filename="lenet" + INFER_PARAMS_SUFFIX))
-
-        dynamic_ops = dynamic_inference_program.global_block().ops
-        static_ops = static_inference_program.global_block().ops
-
-        for op in dynamic_ops[:]:
-            if op.type == "flatten2" or 'fake' in op.type:
-                dynamic_ops.remove(op)
-
-        for op in static_ops[:]:
-            if 'fake' in op.type:
-                static_ops.remove(op)
-
-        op_count = 0
-        for i in range(len(dynamic_ops)):
-            if dynamic_ops[i].has_attr("out_threshold"):
-                op_count += 1
-                self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
-                if dynamic_ops[i].attr("out_threshold") != static_ops[i].attr(
-                        "out_threshold"):
-                    _logger.info(dynamic_ops[i].attr("out_threshold"))
-                    _logger.info(static_ops[i].attr("out_threshold"))
-                self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
-                                static_ops[i].attr("out_threshold"))
-
-        _logger.info("op_cout: {}".format(op_count))
-        self.assertTrue(op_count == 14)
+        for i in range(len(loss_list) - 1):
+            self.assertTrue(
+                loss_list[i] > loss_list[i + 1],
+                msg='Failed to do the imperative qat.')


 class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase):
     def test_save_quantized_model(self):
-        weight_quantize_type = 'abs_max'
-        activation_quantize_type = 'moving_average_abs_max'
+        lr = 0.001
+
         load_param_path = "test_save_quantized_model/lenet.pdparams"
-        path = "./dynamic_outscale_infer_model_from_checkpoint/lenet"
-        dynamic_model_save_dir = "./dynamic_outscale_infer_model_from_checkpoint"
-        static_model_save_dir = "./static_outscale_infer_model"
+        save_path = "./dynamic_outscale_infer_model_from_checkpoint/lenet"

+        weight_quantize_type = 'abs_max'
+        activation_quantize_type = 'moving_average_abs_max'
         imperative_out_scale = ImperativeQuantAware(
             weight_quantize_type=weight_quantize_type,
             activation_quantize_type=activation_quantize_type)
@@ -426,56 +186,25 @@ def test_save_quantized_model(self):
         imperative_out_scale.quantize(lenet)
         lenet.set_dict(load_dict)

+        reader = paddle.batch(
+            paddle.dataset.mnist.test(), batch_size=32, drop_last=True)
+        adam = AdamOptimizer(
+            learning_rate=lr, parameter_list=lenet.parameters())
+        loss_list = train_lenet(lenet, reader, adam)
+        lenet.eval()
+
         imperative_out_scale.save_quantized_model(
             layer=lenet,
-            path=path,
+            path=save_path,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, 1, 28, 28], dtype='float32')
             ])

-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-        else:
-            place = core.CPUPlace()
-        exe = fluid.Executor(place)
-
-        # load dynamic model
-        [dynamic_inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=dynamic_model_save_dir,
-                executor=exe,
-                model_filename="lenet" + INFER_MODEL_SUFFIX,
-                params_filename="lenet" + INFER_PARAMS_SUFFIX))
-        # load static model
-        [static_inference_program, feed_target_names, fetch_targets] = (
-            fluid.io.load_inference_model(
-                dirname=static_model_save_dir,
-                executor=exe,
-                model_filename="lenet" + INFER_MODEL_SUFFIX,
-                params_filename="lenet" + INFER_PARAMS_SUFFIX))
-
-        dynamic_ops = dynamic_inference_program.global_block().ops
-        static_ops = static_inference_program.global_block().ops
-
-        for op in dynamic_ops[:]:
-            if op.type == "flatten2" or 'fake' in op.type:
-                dynamic_ops.remove(op)
-
-        for op in static_ops[:]:
-            if 'fake' in op.type:
-                static_ops.remove(op)
-
-        op_count = 0
-        for i in range(len(dynamic_ops)):
-            if dynamic_ops[i].has_attr("out_threshold"):
-                op_count += 1
-                self.assertTrue(dynamic_ops[i].type == static_ops[i].type)
-                self.assertTrue(dynamic_ops[i].attr("out_threshold") ==
-                                static_ops[i].attr("out_threshold"))
-
-        _logger.info("op_cout: {}".format(op_count))
-        self.assertTrue(op_count == 14)
+        for i in range(len(loss_list) - 1):
+            self.assertTrue(
+                loss_list[i] > loss_list[i + 1],
+                msg='Failed to do the imperative qat.')


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
index 99a23525409f37..bf411e5b38efae 100644
--- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
+++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py
@@ -21,20 +21,20 @@
 import time
 import unittest
 import logging
+
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.framework import IrGraph
 from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware
-from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass
 from paddle.fluid.dygraph.container import Sequential
 from paddle.nn import Linear, Conv2D, Softmax
-from paddle.fluid.dygraph.nn import Pool2D
 from paddle.fluid.log_helper import get_logger
 from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX
 from paddle.fluid.contrib.slim.quantization.imperative.quant_nn import QuantizedConv2D

+from imperative_test_utils import fix_model_dict, ImperativeLenet
+
 paddle.enable_static()

 os.environ["CPU_NUM"] = "1"
@@ -45,115 +45,6 @@
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')


-def StaticLenet(data, num_classes=10):
-    conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
-    conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
-    fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
-    fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
-    fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
-    conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
-    conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
-    fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
-    fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
-    fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
-    conv1 = fluid.layers.conv2d(
-        data,
-        num_filters=6,
-        filter_size=3,
-        stride=1,
-        padding=1,
-        param_attr=conv2d_w1_attr,
-        bias_attr=conv2d_b1_attr)
-    pool1 = fluid.layers.pool2d(
-        conv1, pool_size=2, pool_type='max', pool_stride=2)
-    conv2 = fluid.layers.conv2d(
-        pool1,
-        num_filters=16,
-        filter_size=5,
-        stride=1,
-        padding=0,
-        param_attr=conv2d_w2_attr,
-        bias_attr=conv2d_b2_attr)
-    pool2 = fluid.layers.pool2d(
-        conv2, pool_size=2, pool_type='max', pool_stride=2)
-
-    fc1 = fluid.layers.fc(input=pool2,
-                          size=120,
-                          param_attr=fc_w1_attr,
-                          bias_attr=fc_b1_attr)
-    fc2 = fluid.layers.fc(input=fc1,
-                          size=84,
-                          param_attr=fc_w2_attr,
-                          bias_attr=fc_b2_attr)
-    fc3 = fluid.layers.fc(input=fc2,
-                          size=num_classes,
-                          param_attr=fc_w3_attr,
-                          bias_attr=fc_b3_attr)
-    fc4 = fluid.layers.softmax(fc3, use_cudnn=True)
-
-    return fc4
-
-
-class ImperativeLenet(fluid.dygraph.Layer):
-    def __init__(self, num_classes=10):
-        super(ImperativeLenet, self).__init__()
-        conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1")
-        conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2")
-        fc_w1_attr = fluid.ParamAttr(name="fc_w_1")
-        fc_w2_attr = fluid.ParamAttr(name="fc_w_2")
-        fc_w3_attr = fluid.ParamAttr(name="fc_w_3")
-        conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1")
-        conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2")
-        fc_b1_attr = fluid.ParamAttr(name="fc_b_1")
-        fc_b2_attr = fluid.ParamAttr(name="fc_b_2")
-        fc_b3_attr = fluid.ParamAttr(name="fc_b_3")
-        self.features = Sequential(
-            Conv2D(
-                in_channels=1,
-                out_channels=6,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                weight_attr=conv2d_w1_attr,
-                bias_attr=conv2d_b1_attr),
-            Pool2D(
-                pool_size=2, pool_type='max', pool_stride=2),
-            Conv2D(
-                in_channels=6,
-                out_channels=16,
-                kernel_size=5,
-                stride=1,
-                padding=0,
-                weight_attr=conv2d_w2_attr,
-                bias_attr=conv2d_b2_attr),
-            Pool2D(
-                pool_size=2, pool_type='max', pool_stride=2))
-
-        self.fc = Sequential(
-            Linear(
-                in_features=400,
-                out_features=120,
-                weight_attr=fc_w1_attr,
-                bias_attr=fc_b1_attr),
-            Linear(
-                in_features=120,
-                out_features=84,
-                weight_attr=fc_w2_attr,
-                bias_attr=fc_b2_attr),
-            Linear(
-                in_features=84,
-                out_features=num_classes,
-                weight_attr=fc_w3_attr,
-                bias_attr=fc_b3_attr),
-            Softmax())
-
-    def forward(self, inputs):
-        x = self.features(inputs)
-        x = fluid.layers.flatten(x, 1)
-        x = self.fc(x)
-        return x
-
-
 class TestImperativeQat(unittest.TestCase):
     """
     QAT = quantization-aware training
@@ -164,19 +55,26 @@ def setUpClass(cls):
         timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
         cls.root_path = os.path.join(os.getcwd(), "imperative_qat_" + timestamp)
         cls.save_path = os.path.join(cls.root_path, "lenet")
-        cls.dynamic_root_path = os.path.join(os.getcwd(),
-                                             "dynamic_mnist_" + timestamp)
-        cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model")

     @classmethod
     def tearDownClass(cls):
-        shutil.rmtree(cls.root_path)
-        shutil.rmtree(cls.dynamic_root_path)
+        try:
+            shutil.rmtree(cls.root_path)
+        except Exception as e:
+            print("Failed to delete {} due to {}".format(cls.root_path, str(e)))
+
+    def set_vars(self):
+        self.weight_quantize_type = None
+        self.activation_quantize_type = None
+        print('weight_quantize_type', self.weight_quantize_type)
+
+    def run_qat_save(self):
+        self.set_vars()

-    def test_qat_save(self):
         imperative_qat = ImperativeQuantAware(
-            weight_quantize_type='abs_max',
-            activation_quantize_type='moving_average_abs_max')
+            weight_quantize_type=self.weight_quantize_type,
+            activation_quantize_type=self.activation_quantize_type)
+
         with fluid.dygraph.guard():
             # For CI coverage
             conv1 = Conv2D(
@@ -190,10 +88,17 @@ def test_qat_save(self):
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             quant_conv1(fluid.dygraph.to_variable(data))

+            seed = 1
+            np.random.seed(seed)
+            fluid.default_main_program().random_seed = seed
+            fluid.default_startup_program().random_seed = seed
+            lenet = 
ImperativeLenet() + lenet = fix_model_dict(lenet) imperative_qat.quantize(lenet) adam = AdamOptimizer( learning_rate=0.001, parameter_list=lenet.parameters()) + train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=32, drop_last=True) test_reader = paddle.batch( @@ -226,6 +131,7 @@ def test_qat_save(self): break lenet.eval() + eval_acc_top1_list = [] for batch_id, data in enumerate(test_reader()): x_data = np.array([x[0].reshape(1, 28, 28) for x in data]).astype('float32') @@ -242,14 +148,19 @@ def test_qat_save(self): input=out, label=label, k=5) if batch_id % 100 == 0: + eval_acc_top1_list.append(float(acc_top1.numpy())) _logger.info( "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". format(epoch, batch_id, acc_top1.numpy(), acc_top5.numpy())) - # save weights - model_dict = lenet.state_dict() - fluid.save_dygraph(model_dict, "save_temp") + # check eval acc + eval_acc_top1 = sum(eval_acc_top1_list) / len( + eval_acc_top1_list) + print('eval_acc_top1', eval_acc_top1) + self.assertTrue( + eval_acc_top1 > 0.9, + msg="The test acc {%f} is less than 0.9." % eval_acc_top1) # test the correctness of `paddle.jit.save` data = next(test_reader()) @@ -260,13 +171,14 @@ def test_qat_save(self): before_save = lenet(test_img) # save inference quantized model - paddle.jit.save( + imperative_qat.save_quantized_model( layer=lenet, - path=TestImperativeQat.save_path, + path=self.save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') ]) + print('Quantized model saved in {%s}' % self.save_path) if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) @@ -275,183 +187,27 @@ def test_qat_save(self): exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - dirname=TestImperativeQat.root_path, + dirname=self.root_path, executor=exe, model_filename="lenet" + INFER_MODEL_SUFFIX, params_filename="lenet" + INFER_PARAMS_SUFFIX) after_save, = exe.run(inference_program, feed={feed_target_names[0]: test_data}, fetch_list=fetch_targets) - + # check self.assertTrue( np.allclose(after_save, before_save.numpy()), msg='Failed to save the inference quantized model.') - def test_qat_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'abs_max' - activation_quant_type = 'moving_average_abs_max' - param_init_map = {} - seed = 1000 - lr = 0.01 - - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) - imperative_qat = ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quant_type) - with fluid.dygraph.guard(): - np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = 
param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) +class TestImperativeQatAbsMax(TestImperativeQat): + def set_vars(self): + self.weight_quantize_type = 'abs_max' + self.activation_quantize_type = 'moving_average_abs_max' + print('weight_quantize_type', self.weight_quantize_type) - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - - paddle.jit.save( - layer=lenet, - path=TestImperativeQat.dynamic_save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - # static graph train - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param.name], place) - - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - - save_program = infer_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model("./static_mnist", [infer_img.name], - [infer_pre], exe, save_program) - rtol = 1e-05 - atol 
= 1e-08 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". - format(diff, i, loss_d, loss_s)) - break - - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') + def test_qat(self): + self.run_qat_save() if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py deleted file mode 100644 index f5b3e89ef415c1..00000000000000 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py +++ /dev/null @@ -1,494 +0,0 @@ -# copyright (c) 2018 paddlepaddle authors. all rights reserved. -# -# licensed under the apache license, version 2.0 (the "license"); -# you may not use this file except in compliance with the license. -# you may obtain a copy of the license at -# -# http://www.apache.org/licenses/license-2.0 -# -# unless required by applicable law or agreed to in writing, software -# distributed under the license is distributed on an "as is" basis, -# without warranties or conditions of any kind, either express or implied. -# see the license for the specific language governing permissions and -# limitations under the license. - -from __future__ import print_function - -import os -import numpy as np -import random -import shutil -import time -import unittest -import logging -import paddle -import six -import paddle.fluid as fluid -from paddle.nn import functional -from paddle.nn import Linear, Conv2D, Softmax, BatchNorm -from paddle.fluid.layers import nn -from paddle.fluid import core -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware, QuantizationTransformPass, AddQuantDequantPass -from paddle.fluid.dygraph.container import Sequential -from paddle.fluid.dygraph.nn import Pool2D -from paddle.nn.layer.activation import ReLU, LeakyReLU, ReLU6, Tanh, Swish -from paddle.fluid.log_helper import get_logger -from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX - -paddle.enable_static() - -os.environ["CPU_NUM"] = "1" -if core.is_compiled_with_cuda(): - fluid.set_flags({"FLAGS_cudnn_deterministic": True}) - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') - - -def StaticLenet(data, num_classes=10): - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - conv2d_w3_attr = fluid.ParamAttr(name="conv2d_w_3") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - conv2d_b3_attr = fluid.ParamAttr(name="conv2d_b_3") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - - conv1 = fluid.layers.conv2d( - data, - num_filters=6, - filter_size=3, - stride=1, - padding=1, - param_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - conv1 = fluid.layers.leaky_relu(conv1, alpha=0.02) - pool1 = 
fluid.layers.pool2d( - conv1, pool_size=2, pool_type='max', pool_stride=2) - conv2 = fluid.layers.conv2d( - pool1, - num_filters=16, - filter_size=5, - stride=1, - padding=0, - param_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - pool2 = fluid.layers.pool2d( - conv2, pool_size=2, pool_type='max', pool_stride=2) - pool2 = fluid.layers.relu(pool2) - pool2 = fluid.layers.swish(pool2) - conv3 = fluid.layers.conv2d( - pool2, - num_filters=16, - filter_size=1, - stride=1, - padding=0, - param_attr=conv2d_w3_attr, - bias_attr=conv2d_b3_attr) - conv3 = fluid.layers.relu6(conv3) - conv3 = paddle.tensor.math.tanh(conv3) - fc1 = fluid.layers.fc(input=conv3, - size=120, - param_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - fc2 = fluid.layers.fc(input=fc1, - size=84, - param_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - fc3 = fluid.layers.fc(input=fc2, - size=num_classes, - param_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - fc3 = fluid.layers.softmax(fc3, use_cudnn=True) - - return fc3 - - -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - conv2d_w3_attr = fluid.ParamAttr(name="conv2d_w_3") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - conv2d_b3_attr = fluid.ParamAttr(name="conv2d_b_3") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.features = Sequential( - Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, - stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr), - LeakyReLU(negative_slope=0.02), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - ReLU(), - Swish(), - Conv2D( - in_channels=16, - out_channels=16, - kernel_size=1, - stride=1, - padding=0, - weight_attr=conv2d_w3_attr, - bias_attr=conv2d_b3_attr), - ReLU6(), - Tanh()) - self.fc = Sequential( - Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr), - Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr), - Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr), - Softmax()) - - def forward(self, inputs): - x = self.features(inputs) - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - -class TestImperativeAddQuantDequant(unittest.TestCase): - @classmethod - def setUpClass(cls): - timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) - cls.root_path = os.path.join(os.getcwd(), - "imperative_qat_aqd_" + timestamp) - cls.save_path = os.path.join(cls.root_path, "lenet") - cls.dynamic_root_path = os.path.join(os.getcwd(), - "dynamic_mnist_aqd_" + timestamp) - cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model") - - @classmethod - def tearDownClass(cls): - shutil.rmtree(cls.root_path) - shutil.rmtree(cls.dynamic_root_path) - - def test_qat_save(self): - - imperative_qat = ImperativeQuantAware( - weight_quantize_type='abs_max', - 
activation_quantize_type='moving_average_abs_max', - quantizable_layer_type=[ - 'Conv2D', 'Linear', 'ReLU', 'LeakyReLU', 'ReLU6', 'Tanh', - 'Swish' - ]) - - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=lenet.parameters()) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=32, drop_last=True) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32) - - epoch_num = 1 - for epoch in range(epoch_num): - lenet.train() - for batch_id, data in enumerate(train_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - out = lenet(img) - acc = fluid.layers.accuracy(out, label) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - if batch_id % 100 == 0: - _logger.info( - "Train | At epoch {} step {}: loss = {:}, acc= {:}". - format(epoch, batch_id, - avg_loss.numpy(), acc.numpy())) - if batch_id == 500: # For shortening CI time - break - - lenet.eval() - for batch_id, data in enumerate(test_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - acc_top1 = fluid.layers.accuracy( - input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy( - input=out, label=label, k=5) - - if batch_id % 100 == 0: - _logger.info( - "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". 
- format(epoch, batch_id, - acc_top1.numpy(), acc_top5.numpy())) - - # save weights - model_dict = lenet.state_dict() - fluid.save_dygraph(model_dict, "save_temp") - - # test the correctness of `paddle.jit.save` - data = next(test_reader()) - test_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - test_img = fluid.dygraph.to_variable(test_data) - lenet.eval() - before_save = lenet(test_img) - - # save inference quantized model - paddle.jit.save( - layer=lenet, - path=TestImperativeAddQuantDequant.save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - dirname=TestImperativeAddQuantDequant.root_path, - executor=exe, - model_filename="lenet" + INFER_MODEL_SUFFIX, - params_filename="lenet" + INFER_PARAMS_SUFFIX) - after_save, = exe.run(inference_program, - feed={feed_target_names[0]: test_data}, - fetch_list=fetch_targets) - - self.assertTrue( - np.allclose(after_save, before_save.numpy()), - msg='Failed to save the inference quantized model.') - - def test_qat_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'abs_max' - activation_quant_type = 'moving_average_abs_max' - param_init_map = {} - seed = 1000 - lr = 0.001 - - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) - imperative_qat = ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quant_type, - quantizable_layer_type=[ - 'Conv2D', 'Linear', 'ReLU', 'LeakyReLU', 'ReLU6', 'Tanh', - 'Swish' - ]) - - with fluid.dygraph.guard(): - np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) - - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - 
avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - if batch_id > 500: - break - lenet.eval() - paddle.jit.save( - layer=lenet, - path=TestImperativeAddQuantDequant.dynamic_save_path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - # static graph train - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param.name], place) - - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - add_quant_dequant_pass = AddQuantDequantPass( - scope=scope, - place=place, - quantizable_op_type=[ - 'relu', 'leaky_relu', 'relu6', 'tanh', 'swish' - ]) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - add_quant_dequant_pass.apply(main_graph) - add_quant_dequant_pass.apply(infer_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - - save_program = infer_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model("./static_mnist", [infer_img.name], - [infer_pre], exe, save_program) - rtol = 1e-08 - atol = 1e-10 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". 
- format(diff, i, loss_d, loss_s)) - break - - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index f888edfcc977ae..3d2cad388d172e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -19,18 +19,13 @@ import random import unittest import logging + import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.framework import IrGraph -from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass -from paddle.fluid.dygraph.container import Sequential -from paddle.nn import Linear, Conv2D, Softmax -from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger -from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX + +from test_imperative_qat import TestImperativeQat paddle.enable_static() @@ -42,388 +37,14 @@ __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -def StaticLenet(data, num_classes=10): - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - conv1 = fluid.layers.conv2d( - data, - num_filters=6, - filter_size=3, - stride=1, - padding=1, - param_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - pool1 = fluid.layers.pool2d( - conv1, pool_size=2, pool_type='max', pool_stride=2) - conv2 = fluid.layers.conv2d( - pool1, - num_filters=16, - filter_size=5, - stride=1, - padding=0, - param_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - pool2 = fluid.layers.pool2d( - conv2, pool_size=2, pool_type='max', pool_stride=2) - - fc1 = fluid.layers.fc(input=pool2, - size=120, - param_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - fc2 = fluid.layers.fc(input=fc1, - size=84, - param_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - fc3 = fluid.layers.fc(input=fc2, - size=num_classes, - param_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - fc3 = fluid.layers.softmax(fc3, use_cudnn=True) - - return fc3 - - -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.features = Sequential( - Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, 
- stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), - Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2)) - - self.fc = Sequential( - Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr), - Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr), - Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr), - Softmax()) - - def forward(self, inputs): - x = self.features(inputs) - x = fluid.layers.flatten(x, 1) - x = self.fc(x) - return x - - -class TestImperativeQatChannelWise(unittest.TestCase): - """ - QAT = quantization-aware training - """ - - def test_qat_save(self): - imperative_qat = ImperativeQuantAware( - weight_quantize_type='channel_wise_abs_max', - activation_quantize_type='moving_average_abs_max') - - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=lenet.parameters()) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=32, drop_last=True) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32) - - epoch_num = 1 - for epoch in range(epoch_num): - lenet.train() - for batch_id, data in enumerate(train_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - out = lenet(img) - acc = fluid.layers.accuracy(out, label) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - if batch_id % 100 == 0: - _logger.info( - "Train | At epoch {} step {}: loss = {:}, acc= {:}". - format(epoch, batch_id, - avg_loss.numpy(), acc.numpy())) - - lenet.eval() - for batch_id, data in enumerate(test_reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - acc_top1 = fluid.layers.accuracy( - input=out, label=label, k=1) - acc_top5 = fluid.layers.accuracy( - input=out, label=label, k=5) - - if batch_id % 100 == 0: - _logger.info( - "Test | At epoch {} step {}: acc1 = {:}, acc5 = {:}". 
- format(epoch, batch_id, - acc_top1.numpy(), acc_top5.numpy())) - - # save weights - model_dict = lenet.state_dict() - fluid.save_dygraph(model_dict, "save_temp") - - # test the correctness of `paddle.jit.save` - data = next(test_reader()) - test_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - test_img = fluid.dygraph.to_variable(test_data) - lenet.eval() - before_save = lenet(test_img) - - # save inference quantized model - path = "./qat_infer_model/mnist" - save_dir = "./qat_infer_model" - paddle.jit.save( - layer=lenet, - path=path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - dirname=save_dir, - executor=exe, - model_filename="mnist" + INFER_MODEL_SUFFIX, - params_filename="mnist" + INFER_PARAMS_SUFFIX) - after_save, = exe.run(inference_program, - feed={feed_target_names[0]: test_data}, - fetch_list=fetch_targets) - - self.assertTrue( - np.allclose(after_save, before_save.numpy()), - msg='Failed to save the inference quantized model.') - - def test_qat_acc(self): - def _build_static_lenet(main, startup, is_test=False, seed=1000): - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - main.random_seed = seed - startup.random_seed = seed - img = fluid.layers.data( - name='image', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data( - name='label', shape=[1], dtype='int64') - prediction = StaticLenet(img) - if not is_test: - loss = fluid.layers.cross_entropy( - input=prediction, label=label) - avg_loss = fluid.layers.mean(loss) - else: - avg_loss = prediction - return img, label, avg_loss - - reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=32, drop_last=True) - weight_quantize_type = 'channel_wise_abs_max' - activation_quant_type = 'moving_average_abs_max' - param_init_map = {} - seed = 1000 - lr = 0.001 - - # imperative train - _logger.info( - "--------------------------dynamic graph qat--------------------------" - ) - imperative_qat = ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quant_type) - - with fluid.dygraph.guard(): - np.random.seed(seed) - fluid.default_main_program().random_seed = seed - fluid.default_startup_program().random_seed = seed - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, size=np.product(p_shape)).reshape( - p_shape).astype('float32') - fixed_state[name] = value - param_init_map[param.name] = value - lenet.set_dict(fixed_state) - - imperative_qat.quantize(lenet) - adam = AdamOptimizer( - learning_rate=lr, parameter_list=lenet.parameters()) - dynamic_loss_rec = [] - lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - 
lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) - - paddle.jit.save( - layer=lenet, - path="./dynamic_mnist/model", - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - # static graph train - _logger.info( - "--------------------------static graph qat--------------------------" - ) - static_loss_rec = [] - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() - exe = fluid.Executor(place) - - main = fluid.Program() - infer = fluid.Program() - startup = fluid.Program() - static_img, static_label, static_loss = _build_static_lenet( - main, startup, False, seed) - infer_img, _, infer_pre = _build_static_lenet(infer, startup, True, - seed) - with fluid.unique_name.guard(): - with fluid.program_guard(main, startup): - opt = AdamOptimizer(learning_rate=lr) - opt.minimize(static_loss) - - scope = core.Scope() - with fluid.scope_guard(scope): - exe.run(startup) - for param in main.all_parameters(): - param_tensor = scope.var(param.name).get_tensor() - param_tensor.set(param_init_map[param.name], place) - - main_graph = IrGraph(core.Graph(main.desc), for_test=False) - infer_graph = IrGraph(core.Graph(infer.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=scope, - place=place, - activation_quantize_type=activation_quant_type, - weight_quantize_type=weight_quantize_type, - quantizable_op_type=['conv2d', 'depthwise_conv2d', 'mul']) - transform_pass.apply(main_graph) - transform_pass.apply(infer_graph) - build_strategy = fluid.BuildStrategy() - build_strategy.fuse_all_reduce_ops = False - binary = fluid.CompiledProgram(main_graph.graph).with_data_parallel( - loss_name=static_loss.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[static_img, static_label], place=place) - with fluid.scope_guard(scope): - for batch_id, data in enumerate(reader()): - loss_v, = exe.run(binary, - feed=feeder.feed(data), - fetch_list=[static_loss]) - static_loss_rec.append(loss_v[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', loss_v)) - - save_program = infer_graph.to_program() - with fluid.scope_guard(scope): - fluid.io.save_inference_model("./static_mnist", [infer_img.name], - [infer_pre], exe, save_program) - rtol = 1e-05 - atol = 1e-08 - for i, (loss_d, - loss_s) in enumerate(zip(dynamic_loss_rec, static_loss_rec)): - diff = np.abs(loss_d - loss_s) - if diff > (atol + rtol * np.abs(loss_s)): - _logger.info( - "diff({}) at {}, dynamic loss = {}, static loss = {}". 
- format(diff, i, loss_d, loss_s)) - break +class TestImperativeQatChannelWise(TestImperativeQat): + def set_vars(self): + self.weight_quantize_type = 'channel_wise_abs_max' + self.activation_quantize_type = 'moving_average_abs_max' + print('weight_quantize_type', self.weight_quantize_type) - self.assertTrue( - np.allclose( - np.array(dynamic_loss_rec), - np.array(static_loss_rec), - rtol=rtol, - atol=atol, - equal_nan=True), - msg='Failed to do the imperative qat.') + def test_qat(self): + self.run_qat_save() if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index bda02769cea861..bb24f941c625e5 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -31,6 +31,8 @@ from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger +from imperative_test_utils import fix_model_dict, train_lenet, ImperativeLenetWithSkipQuant + os.environ["CPU_NUM"] = "1" if core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -39,144 +41,33 @@ __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -class ImperativeLenet(fluid.dygraph.Layer): - def __init__(self, num_classes=10): - super(ImperativeLenet, self).__init__() - conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") - conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") - fc_w1_attr = fluid.ParamAttr(name="fc_w_1") - fc_w2_attr = fluid.ParamAttr(name="fc_w_2") - fc_w3_attr = fluid.ParamAttr(name="fc_w_3") - conv2d_b1_attr = fluid.ParamAttr(name="conv2d_b_1") - conv2d_b2_attr = fluid.ParamAttr(name="conv2d_b_2") - fc_b1_attr = fluid.ParamAttr(name="fc_b_1") - fc_b2_attr = fluid.ParamAttr(name="fc_b_2") - fc_b3_attr = fluid.ParamAttr(name="fc_b_3") - self.conv2d_0 = Conv2D( - in_channels=1, - out_channels=6, - kernel_size=3, - stride=1, - padding=1, - weight_attr=conv2d_w1_attr, - bias_attr=conv2d_b1_attr) - self.conv2d_0.skip_quant = True - - self.batch_norm_0 = BatchNorm(6) - self.relu_0 = ReLU() - self.pool2d_0 = Pool2D(pool_size=2, pool_type='max', pool_stride=2) - self.conv2d_1 = Conv2D( - in_channels=6, - out_channels=16, - kernel_size=5, - stride=1, - padding=0, - weight_attr=conv2d_w2_attr, - bias_attr=conv2d_b2_attr) - self.conv2d_1.skip_quant = False - - self.batch_norm_1 = BatchNorm(16) - self.relu6_0 = ReLU6() - self.pool2d_1 = Pool2D(pool_size=2, pool_type='max', pool_stride=2) - self.linear_0 = Linear( - in_features=400, - out_features=120, - weight_attr=fc_w1_attr, - bias_attr=fc_b1_attr) - self.linear_0.skip_quant = True - - self.leaky_relu_0 = LeakyReLU() - self.linear_1 = Linear( - in_features=120, - out_features=84, - weight_attr=fc_w2_attr, - bias_attr=fc_b2_attr) - self.linear_1.skip_quant = False - - self.sigmoid_0 = Sigmoid() - self.linear_2 = Linear( - in_features=84, - out_features=num_classes, - weight_attr=fc_w3_attr, - bias_attr=fc_b3_attr) - self.linear_2.skip_quant = False - self.softmax_0 = Softmax() - - def forward(self, inputs): - x = self.conv2d_0(inputs) - x = self.batch_norm_0(x) - x = self.relu_0(x) - x = self.pool2d_0(x) - x = self.conv2d_1(x) - x = self.batch_norm_1(x) - x = self.relu6_0(x) - x = self.pool2d_1(x) - - x = fluid.layers.flatten(x, 1) - - x = self.linear_0(x) - x = self.leaky_relu_0(x) - x = self.linear_1(x) - x = self.sigmoid_0(x) - x = self.linear_2(x) - x = self.softmax_0(x) - - return x - - class 
TestImperativeOutSclae(unittest.TestCase): def test_out_scale_acc(self): seed = 1000 lr = 0.1 - imperative_out_scale = ImperativeQuantAware() + qat = ImperativeQuantAware() np.random.seed(seed) reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=512, drop_last=True) - lenet = ImperativeLenet() - fixed_state = {} - for name, param in lenet.named_parameters(): - p_shape = param.numpy().shape - p_value = param.numpy() - if name.endswith("bias"): - value = np.zeros_like(p_value).astype('float32') - else: - value = np.random.normal( - loc=0.0, scale=0.01, - size=np.product(p_shape)).reshape(p_shape).astype('float32') - fixed_state[name] = value - lenet.set_dict(fixed_state) - imperative_out_scale.quantize(lenet) + + lenet = ImperativeLenetWithSkipQuant() + lenet = fix_model_dict(lenet) + qat.quantize(lenet) + adam = AdamOptimizer( learning_rate=lr, parameter_list=lenet.parameters()) dynamic_loss_rec = [] lenet.train() - for batch_id, data in enumerate(reader()): - x_data = np.array([x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(-1, 1) - - img = fluid.dygraph.to_variable(x_data) - label = fluid.dygraph.to_variable(y_data) - - out = lenet(img) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.mean(loss) - avg_loss.backward() - adam.minimize(avg_loss) - lenet.clear_gradients() - dynamic_loss_rec.append(avg_loss.numpy()[0]) - if batch_id % 100 == 0: - _logger.info('{}: {}'.format('loss', avg_loss.numpy())) + loss_list = train_lenet(lenet, reader, adam) lenet.eval() path = "./save_dynamic_quant_infer_model/lenet" save_dir = "./save_dynamic_quant_infer_model" - imperative_out_scale.save_quantized_model( + qat.save_quantized_model( layer=lenet, path=path, input_spec=[ diff --git a/python/paddle/fluid/contrib/sparsity/__init__.py b/python/paddle/fluid/contrib/sparsity/__init__.py index f78ea1b1c38b85..b36a79b8ca865e 100644 --- a/python/paddle/fluid/contrib/sparsity/__init__.py +++ b/python/paddle/fluid/contrib/sparsity/__init__.py @@ -15,7 +15,22 @@ from __future__ import print_function -from . import utils -from .utils import * +from .utils import calculate_density +from .utils import check_mask_1d +from .utils import get_mask_1d +from .utils import check_mask_2d +from .utils import get_mask_2d_greedy +from .utils import get_mask_2d_best +from .utils import create_mask +from .utils import check_sparsity +from .utils import MaskAlgo +from .utils import CheckMethod +from .asp import decorate, prune_model +from .asp import set_excluded_layers, reset_excluded_layers -__all__ = utils.__all__ +__all__ = [ + 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', + 'MaskAlgo', 'CheckMethod', 'decorate', 'prune_model', 'set_excluded_layers', + 'reset_excluded_layers' +] diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py new file mode 100644 index 00000000000000..fbabc73f37bce5 --- /dev/null +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -0,0 +1,497 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Functions for Auto SParsity (ASP) training and inference.
+"""
+
+import copy
+import numpy as np
+import paddle
+from paddle.fluid import framework, global_scope, program_guard, layers
+from paddle.fluid.initializer import ConstantInitializer
+from paddle.fluid.contrib import sparsity
+from paddle.fluid import core
+
+__all__ = [
+    'decorate', 'prune_model', 'set_excluded_layers', 'reset_excluded_layers'
+]
+
+
+def set_excluded_layers(main_program, param_names):
+    r"""
+    Set the names of the layers whose parameters should not be pruned as sparse weights.
+
+    Args:
+        main_program (Program): Program with model definition and its parameters.
+        param_names (list): A list containing the names of parameters.
+    """
+    ASPHelper.set_excluded_layers(
+        main_program=main_program, param_names=param_names)
+
+
+def reset_excluded_layers(main_program=None):
+    r"""
+    Reset the excluded-layers setting corresponding to :attr:`main_program`. If :attr:`main_program`
+    is None, all excluded_layers configurations will be cleared.
+
+    Args:
+        main_program (Program, optional): Program with model definition and its parameters.
+    """
+    ASPHelper.reset_excluded_layers(main_program=main_program)
+
+
+def decorate(optimizer):
+    r"""
+    Wrap the given optimizer as an OptimizerWithSparsityGuarantee,
+    which inserts the ops required for ASP workflows when minimize() is called.
+
+    Args:
+        optimizer (Optimizer): An Optimizer used for training.
+    Returns:
+        OptimizerWithSparsityGuarantee: A wrapper for ASP that decorates the `minimize` function of the given optimizer.
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          from paddle.fluid.contrib import sparsity
+
+          main_program = fluid.Program()
+          startup_program = fluid.Program()
+
+          with fluid.program_guard(main_program, startup_program):
+              input_data = fluid.layers.data(name='data', shape=[None, 128])
+              label = fluid.layers.data(name='label', shape=[None, 10])
+              hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
+              prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
+              loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))
+
+              optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+
+              optimizer = sparsity.decorate(optimizer)
+              optimizer.minimize(loss, startup_program)
+
+          # When applying distributed training with Fleet
+          import paddle.distributed.fleet as fleet
+
+          optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+          optimizer = sparsity.decorate(optimizer)  # Needs to be called before `fleet.distributed_optimizer`
+          optimizer = fleet.distributed_optimizer(optimizer)
+          optimizer.minimize(loss, startup_program)
+    """
+    return ASPHelper.decorate(optimizer)
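A minimal end-to-end sketch of how the three helpers above compose (the parameter name `fc_0.w_0` is Paddle's default name for the first fc layer's weight, as the `ASPHelper._is_supported_layer` example further below also shows):

.. code-block:: python

    import paddle.fluid as fluid
    from paddle.fluid.contrib import sparsity

    main_program = fluid.Program()
    startup_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        input_data = fluid.layers.data(name='data', shape=[None, 128])
        label = fluid.layers.data(name='label', shape=[None, 10])
        hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
        prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
        loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))

        # Keep the first fc layer dense: ASP will skip its weight entirely.
        sparsity.set_excluded_layers(main_program, ['fc_0.w_0'])

        optimizer = sparsity.decorate(fluid.optimizer.SGD(learning_rate=0.1))
        optimizer.minimize(loss, startup_program)

    # Clear the exclusion settings registered for this program.
    sparsity.reset_excluded_layers(main_program)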
+
+
+def prune_model(place,
+                main_program=None,
+                n=2,
+                m=4,
+                func_name=sparsity.MaskAlgo.MASK_1D,
+                with_mask=True):
+    r"""
+    Prune the parameters of supported layers in :attr:`main_program` via the
+    mask generation function specified by :attr:`func_name`. This
+    function supports both training and inference, controlled by :attr:`with_mask`.
+    If :attr:`with_mask` is True, it also prunes the parameter-related ASP mask Variables;
+    otherwise it only prunes the parameters.
+
+    *Note*: If the parameters are supported and in FP16, please set :attr:`n`=2, :attr:`m`=4;
+    if they are in FP32, then :attr:`n`=1, :attr:`m`=2, to further enable Sparse Tensor Core acceleration.
+
+    *Note*: If this function is called with :attr:`with_mask` set to True, `OptimizerWithSparsityGuarantee.minimize`
+    and the initialization (`exe.run(startup_program)`) must have been called first (to successfully obtain the mask Variables).
+    Typically, set `with_mask` to True for training (after calling `OptimizerWithSparsityGuarantee.minimize`) and to False for
+    inference only. To obtain an OptimizerWithSparsityGuarantee, please see `sparsity.decorate()`.
+
+    Args:
+        place (fluid.CPUPlace()|fluid.CUDAPlace(N)): Device place for pruned parameter and mask Variables, where N is the GPU's id. It should be the same place used to create the Executor instance.
+        main_program (Program, optional): Program with model definition and its parameters. Default is `paddle.static.default_main_program()`.
+        n (int): n of `n:m` sparse pattern.
+        m (int): m of `n:m` sparse pattern.
+        func_name (MaskAlgo, optional): The function name used to generate the sparse mask. Default is `MaskAlgo.MASK_1D`. For all options, please refer to `MaskAlgo`.
+        with_mask (bool, optional): Whether to also prune the mask Variables related to the parameters. True prunes them as well; False does not. Default is True.
+    Returns:
+        dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable.
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          from paddle.fluid.contrib import sparsity
+
+          main_program = fluid.Program()
+          startup_program = fluid.Program()
+
+          place = fluid.CUDAPlace(0)
+
+          with fluid.program_guard(main_program, startup_program):
+              input_data = fluid.layers.data(name='data', shape=[None, 128])
+              label = fluid.layers.data(name='label', shape=[None, 10])
+              hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
+              prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
+              loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))
+
+              optimizer = sparsity.decorate(fluid.optimizer.SGD(learning_rate=0.1))
+              optimizer.minimize(loss, startup_program)
+
+          exe = fluid.Executor(place)
+          exe.run(startup_program)
+
+          # Must call `exe.run(startup_program)` first before calling `sparsity.prune_model`
+          sparsity.prune_model(place, main_program, func_name=sparsity.MaskAlgo.MASK_2D_BEST)
+    """
+    return ASPHelper.prune_model(
+        place=place,
+        main_program=main_program,
+        n=n,
+        m=m,
+        func_name=func_name,
+        with_mask=with_mask)
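An `n:m` sparse pattern keeps n non-zero values in every group of m consecutive elements. What `prune_model` does to each supported weight can be reproduced in NumPy with the same `create_mask`/`check_sparsity` calls the implementation uses (the weight shape here is illustrative only):

.. code-block:: python

    import numpy as np
    from paddle.fluid.contrib import sparsity

    w = np.random.rand(64, 32).astype('float32')

    # Generate a 2:4 mask on w.T and transpose it back, mirroring prune_model.
    mask = sparsity.create_mask(
        w.T, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4).T
    w_pruned = w * mask

    # Verify the pruned weight with the check method matching the mask algorithm.
    check_method = sparsity.CheckMethod.get_checking_method(
        sparsity.MaskAlgo.MASK_1D)
    assert sparsity.check_sparsity(w_pruned.T, n=2, m=4, func_name=check_method)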
+
+
+class ProgramASPInfo(object):
+    r"""
+    ProgramASPInfo is a container that keeps the ASP-relevant information of a Program. It contains three inner variables:
+    1. __mask_vars (Dictionary): Key is the parameter's name and value is its corresponding sparse mask Variable object, which is created by `ASPHelper.create_mask_variables`.
+    2. __masks (Dictionary): Key is the parameter's name and value is its corresponding sparse mask NumPy array, which is created by `ASPHelper.prune_model`.
+    3. __excluded_layers (List): It stores the names of the layers which should not be involved in the ASP workflow.
+    """
+
+    def __init__(self):
+        self.__mask_vars = {}
+        self.__masks = {}
+        self.__excluded_layers = []
+
+    def update_mask_vars(self, param_name, var):
+        self.__mask_vars[param_name] = var
+
+    def update_masks(self, param_name, var):
+        self.__masks[param_name] = var
+
+    def update_excluded_layers(self, param_names):
+        self.__excluded_layers.extend(copy.deepcopy(param_names))
+
+    def reset_excluded_layers(self):
+        self.__excluded_layers = []
+
+    @property
+    def mask_vars(self):
+        return self.__mask_vars
+
+    @property
+    def masks(self):
+        return self.__masks
+
+    @property
+    def excluded_layers(self):
+        return self.__excluded_layers
+
+
+class ASPHelper(object):
+    r"""
+    ASPHelper is a collection of Auto SParsity (ASP) functions to enable
+
+    1. training models with weights in a 2:4 sparse pattern on FP16 or a 1:2 sparse pattern on FP32 from scratch.
+    2. pruning well-trained models into a 2:4 sparse pattern on FP16 or a 1:2 sparse pattern on FP32 for fine-tuning.
+    """
+
+    MASK_APPENDDED_NAME = '_asp_mask'
+    SUPPORTED_LAYERS = {'fc': 'w_0', 'linear': 'w_0', 'conv2d': 'w_0'}
+
+    __asp_info = {}
+
+    @classmethod
+    def set_excluded_layers(cls, main_program, param_names):
+        r"""
+        This is the implementation of `sparsity.set_excluded_layers`; for details please see the explanation in `sparsity.set_excluded_layers`.
+        """
+        asp_info = cls._get_program_asp_info(main_program)
+        asp_info.update_excluded_layers(param_names)
+
+    @classmethod
+    def reset_excluded_layers(cls, main_program=None):
+        r"""
+        This is the implementation of `sparsity.reset_excluded_layers`; for details please see the explanation in `sparsity.reset_excluded_layers`.
+        """
+        if main_program is None:
+            for asp_info in cls.__asp_info.values():
+                asp_info.reset_excluded_layers()
+        else:
+            cls._get_program_asp_info(main_program).reset_excluded_layers()
+
+    @staticmethod
+    def decorate(optimizer):
+        r"""
+        This is the implementation of `sparsity.decorate`; for details please see the explanation in `sparsity.decorate`.
+        """
+        return OptimizerWithSparsityGuarantee(optimizer)
+
+    @classmethod
+    def prune_model(cls,
+                    place,
+                    main_program=None,
+                    n=2,
+                    m=4,
+                    func_name=sparsity.MaskAlgo.MASK_1D,
+                    with_mask=True):
+        r"""
+        This is the implementation of `sparsity.prune_model`; for details please see the explanation in `sparsity.prune_model`.
+        """
+        checked_func_name = sparsity.CheckMethod.get_checking_method(func_name)
+
+        if main_program is None:
+            main_program = paddle.static.default_main_program()
+
+        asp_info = cls._get_program_asp_info(main_program)
+        for param in main_program.global_block().all_parameters():
+            if ASPHelper._is_supported_layer(main_program, param.name):
+                weight_tensor = global_scope().find_var(param.name).get_tensor()
+                weight_nparray = np.array(weight_tensor)
+
+                # The double transpose ops here make sure the pruning direction is consistent with cuSparseLt.
+                # SPMMA in cuSparseLt: D = (AxB) + C, where matrix A (mxk) is the sparse matrix.
+                # cuSparseLt prunes matrix A along the k dimension.
+                # In sparse training, a layer's weight matrix is viewed as the sparse matrix A, so
+                # the math formula should be 'Act(WX + b)'. However, the default formula in PaddlePaddle
+                # is 'Act(XW + b)'. To enable SPMMA, weights and inputs should be transposed
+                # for computing, Act( (W^T X^T)^T + b). Therefore, we have to prune along the k dimension
+                # of W^T, which is the m dimension of W. Moreover, all mask generating functions in
+                # sparsity/utils do row-major pruning. That is the reason we have to transpose weight
+                # matrices before invoking create_mask. Then we transpose the resulting mask to make
+                # sure its shape is the same as that of the input weight.
+                weight_sparse_mask = sparsity.create_mask(
+                    weight_nparray.T, func_name=func_name, n=n, m=m).T
+                weight_pruned_nparray = np.multiply(weight_nparray,
+                                                    weight_sparse_mask)
+                weight_tensor.set(weight_pruned_nparray, place)
+                assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
+                    'Failed to prune the weight matrix of {}!'.format(param.name)
+                if with_mask:
+                    weight_mask_param = global_scope().find_var(
+                        ASPHelper._get_mask_name(param.name))
+                    assert weight_mask_param is not None, \
+                        'Cannot find the {} variable, please call ASPHelper.minimize' \
+                        ' and the initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name))
+                    weight_mask_tensor = weight_mask_param.get_tensor()
+                    weight_mask_tensor.set(weight_sparse_mask, place)
+                    asp_info.update_masks(param.name, weight_sparse_mask)
+        return asp_info.masks.copy()
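The double-transpose reasoning in the comment above can be verified directly in NumPy; a sketch, assuming `MaskAlgo.MASK_1D` keeps exactly n survivors per group of m, as magnitude-based 1-D pruning conventionally does:

.. code-block:: python

    import numpy as np
    from paddle.fluid.contrib import sparsity

    w = np.random.rand(8, 16).astype('float32')  # 'XW + b' layout

    # Row-major masking runs along the last axis of w.T, i.e. along axis 0
    # of w, which is the k dimension of W^T that cuSparseLt prunes.
    mask = sparsity.create_mask(
        w.T, func_name=sparsity.MaskAlgo.MASK_1D, n=2, m=4).T

    # Consecutive groups of m=4 along axis 0 of w each keep n=2 entries.
    groups = mask.T.reshape(-1, 4)
    assert (groups.sum(axis=1) == 2).all()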
+                # mask to make sure its shape is the same as that of the input weight.
+                weight_sparse_mask = sparsity.create_mask(
+                    weight_nparray.T, func_name=func_name, n=n, m=m).T
+                weight_pruned_nparray = np.multiply(weight_nparray,
+                                                    weight_sparse_mask)
+                weight_tensor.set(weight_pruned_nparray, place)
+                assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \
+                        'Pruning the weight matrix of {} failed!'.format(param.name)
+                if with_mask:
+                    weight_mask_param = global_scope().find_var(
+                        ASPHelper._get_mask_name(param.name))
+                    assert weight_mask_param is not None, \
+                        'Cannot find the {} variable, please call OptimizerWithSparsityGuarantee.minimize' \
+                        ' and initialization (exe.run(startup_program)) first!'.format(ASPHelper._get_mask_name(param.name))
+                    weight_mask_tensor = weight_mask_param.get_tensor()
+                    weight_mask_tensor.set(weight_sparse_mask, place)
+                asp_info.update_masks(param.name, weight_sparse_mask)
+        return asp_info.masks.copy()
+
+    @staticmethod
+    def _get_mask_name(param_name):
+        r"""
+        Return the mask name for the given parameter name :attr:`param_name`.
+
+        Args:
+            param_name (string): The name of the parameter.
+        Returns:
+            string: The mask name of :attr:`param_name`.
+        """
+        return param_name + ASPHelper.MASK_APPENDDED_NAME
+
+    @staticmethod
+    def _get_not_ASP_relevant_vars(main_program):
+        r"""
+        Get all parameter Variables in :attr:`main_program`, excluding the ASP mask Variables.
+
+        Args:
+            main_program (Program): Program with model definition and its parameters.
+        Returns:
+            list: A list of parameter Variables in :attr:`main_program` (excluding ASP mask Variables).
+        """
+        var_list = []
+        for param in main_program.global_block().all_parameters():
+            if ASPHelper.MASK_APPENDDED_NAME not in param.name:
+                var_list.append(param)
+        return var_list
+
+    @classmethod
+    def _get_program_asp_info(cls, main_program):
+        if main_program not in cls.__asp_info:
+            cls.__asp_info[main_program] = ProgramASPInfo()
+        return cls.__asp_info[main_program]
+
+    @classmethod
+    def _is_supported_layer(cls, main_program, param_name):
+        r"""
+        Verify whether the given :attr:`param_name` is supported by ASP.
+
+        Args:
+            param_name (string): The name of the parameter.
+        Returns:
+            bool: True if it is supported, else False.
+        Examples:
+            .. code-block:: python
+
+              import paddle.fluid as fluid
+              from paddle.fluid.contrib.sparsity.asp import ASPHelper
+
+              main_program = fluid.Program()
+              startup_program = fluid.Program()
+
+              with fluid.program_guard(main_program, startup_program):
+                  input_data = fluid.layers.data(name='data', shape=[None, 128])
+                  fc = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None)
+
+                  for param in main_program.global_block().all_parameters():
+                      ASPHelper._is_supported_layer(main_program, param.name)
+              # fc_0.w_0 -> True
+              # fc_0.b_0 -> False
+        """
+        if ASPHelper.MASK_APPENDDED_NAME in param_name:
+            return False
+
+        for layer in cls._get_program_asp_info(main_program).excluded_layers:
+            if layer in param_name:
+                return False
+
+        for name in ASPHelper.SUPPORTED_LAYERS:
+            if name in param_name and \
+               ASPHelper.SUPPORTED_LAYERS[name] in param_name:
+                return True
+        return False
+
+    @classmethod
+    def _minimize(cls,
+                  optimizer,
+                  loss,
+                  main_program=None,
+                  startup_program=None,
+                  parameter_list=None,
+                  no_grad_set=None):
+        r"""
+        This function is a decorator of the `minimize` function in `Optimizer`.
+        There are three steps:
+
+        1. Call :attr:`optimizer`.minimize(:attr:`loss`)
+        2. Create sparse mask Tensors according to the supported layers in :attr:`main_program`.
+        3. Insert masking ops at the end of the parameter update.
+
+        *Note*: Please use `ASP.decorate` instead when applying distributed training with `Fleet`.
+        (There is an invisible graph optimization in `Fleet.minimize()` which makes the training graph
+        unable to be modified afterwards.)
+
+        Args:
+            optimizer (Optimizer): An Optimizer used for training.
+            loss (Variable): A Variable containing the value to minimize.
+            main_program (Program, optional): Program with model definition and its parameters. Default is `loss.block.program`.
+            startup_program (Program, optional): Program for initializing parameters in `parameter_list`. Default is `paddle.static.default_startup_program()`.
+            parameter_list (Iterable, optional): Iterable of `Variable` or `Variable.name` to update to minimize `loss`. The default value is None, at which time all parameters will be updated.
+            no_grad_set (set, optional): Set of `Variable` or `Variable.name` that don't need to be updated. The default value is None.
+        Returns:
+            list: operators from :attr:`optimizer`.minimize(:attr:`loss`).
+            list: pairs of parameters and their gradients.
+        """
+        if main_program is None:
+            main_program = loss.block.program
+
+        if startup_program is None:
+            startup_program = paddle.static.default_startup_program()
+
+        optimizer_ops, params_and_grads = optimizer.minimize(
+            loss, startup_program, parameter_list, no_grad_set=no_grad_set)
+        cls._create_mask_variables(main_program, startup_program,
+                                   params_and_grads)
+        cls._insert_sparse_mask_ops(main_program, params_and_grads)
+        return optimizer_ops, params_and_grads
+
+    @classmethod
+    def _create_mask_variables(cls, main_program, startup_program,
+                               params_and_grads):
+        r"""
+        Create sparse mask Tensors according to the supported layers in :attr:`main_program`.
+        This function is called in the second step of `ASPHelper._minimize`.
+
+        Args:
+            main_program (Program): Program with model definition and its parameters.
+            startup_program (Program): Program for initializing parameters.
+            params_and_grads (list): Variable pairs of parameters and their gradients.
+        """
+        asp_info = cls._get_program_asp_info(main_program)
+        with program_guard(main_program, startup_program):
+            for param_and_grad in params_and_grads:
+                if ASPHelper._is_supported_layer(main_program,
+                                                 param_and_grad[0].name):
+                    mask_param = layers.create_parameter(
+                        name=param_and_grad[0].name +
+                        ASPHelper.MASK_APPENDDED_NAME,
+                        shape=param_and_grad[0].shape,
+                        dtype=param_and_grad[0].dtype,
+                        default_initializer=ConstantInitializer(value=1.0))
+                    mask_param.stop_gradient = True
+                    mask_param.trainable = False
+                    asp_info.update_mask_vars(param_and_grad[0].name,
+                                              mask_param)
+
+    @classmethod
+    def _insert_sparse_mask_ops(cls, main_program, param_grads):
+        r"""
+        Insert masking ops at the end of the parameter update.
+        This function is called in the third step of `ASPHelper._minimize`.
+
+        Args:
+            main_program (Program): Program with model definition and its parameters.
+            param_grads (list): Variable pairs of parameters and their gradients.
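+
+        Note: each inserted op is equivalent to the in-place update `param = param * mask`
+        (an `elementwise_mul` whose output Variable is the parameter itself), so the
+        pruned weights are zeroed again right after every optimizer step.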
+ """ + block = main_program.global_block() + asp_info = cls._get_program_asp_info(main_program) + for param_grad in param_grads: + if param_grad[0].name in asp_info.mask_vars: + block.append_op( + type='elementwise_mul', + inputs={ + "X": param_grad[0], + 'Y': asp_info.mask_vars[param_grad[0].name] + }, + outputs={'Out': param_grad[0]}, + attrs={'axis': -1, + 'use_mkldnn': False}) + + +class OptimizerWithSparsityGuarantee(object): + r""" + OptimizerWithSparsityGuarantee is a wrapper to decorate `minimize` function of given optimizer by `_minimize` of ASPHelper. + The decorated `minimize` function would do three things (exactly same as `ASPHelper._minimize`): + 1. Call `minimize` function of given optimizer. + 2. Call `ASPHelper._create_mask_variables` to create mask Variables. + 3. Call `ASPHelper._insert_sparse_mask_ops` to insert weight masking ops in the end of `loss`'s Program. + """ + + def __init__(self, optimizer): + self._optimizer = optimizer + self._learning_rate = optimizer._learning_rate + self._learning_rate_map = optimizer._learning_rate_map + + def minimize(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + r""" + This function is to call `ASPHelper.minimize()` and return its return + + Args: + loss (Variable): A Variable containing the value to minimize. + startup_program (Program, optional): Program for initializing parameters in `parameter_list`. Default is `paddle.static.default_startup_program()`. + parameter_list (Iterable, optional): Iterable of `Variable` or `Variable.name` to update to minimize `loss`. The default value is None, at this time all parameters will be updated. + no_grad_set (set, optional): Set of `Variable or `Variable.name` that don't need to be updated. The default value is None. + Returns: + list: operators from :attr:`optimizer`.minimize(:attr:`loss`). + list: pairs of parameters and their gradients. + """ + return ASPHelper._minimize( + self._optimizer, + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set) diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index f1108c327407ff..bb030cbac1beaf 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -27,7 +27,7 @@ import threading __all__ = [ - 'density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', + 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', 'MaskAlgo', 'CheckMethod' ] @@ -75,7 +75,7 @@ def get_checking_method(mask_algo): CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) # CheckMethod.CHECK_2D """ - assert type(mask_algo) == MaskAlgo, \ + assert isinstance(mask_algo, MaskAlgo), \ "mask_algo should be MaskAlgo type" if mask_algo == MaskAlgo.MASK_1D: return CheckMethod.CHECK_1D @@ -83,7 +83,7 @@ def get_checking_method(mask_algo): return CheckMethod.CHECK_2D -def density(x): +def calculate_density(x): r""" Return the density of the input tensor. @@ -99,15 +99,15 @@ def density(x): x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - sparsity.density(x) # 0.625 + sparsity.calculate_density(x) # 0.625 """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size -def reshape_1d(mat, m): +def _reshape_1d(mat, m): r""" - Reshape the input matrix to shape (-1, m). + Reshape the input 2D matrix to shape (-1, m). 
If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. @@ -116,11 +116,13 @@ def reshape_1d(mat, m): remainder = mat.shape[1] % m Args: - mat (nparray): The input matrix. + mat (nparray): The input 2D matrix. m (int): The second dimension of reshaped matrix. Returns: tuple: A pair of the reshaped and padded matrix and the shape of padded matrix (non-reshaping). """ + assert len(mat.shape) == 2, "The input mat should be a 2D matrix!" + remainder = mat.shape[1] % m if mat.shape[1] % m > 0: mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) @@ -165,9 +167,9 @@ def check_mask_1d(mat, n, m): sparsity.check_mask_1d(x, 2, 4) # True """ if len(mat.shape) <= 1: - mat_flattern, shape = reshape_1d(mat.reshape(1, mat.shape[0]), m) + mat_flattern, shape = _reshape_1d(mat.reshape(1, mat.shape[0]), m) else: - mat_flattern, shape = reshape_1d(mat, m) + mat_flattern, shape = _reshape_1d(mat, m) for sub_mat in mat_flattern: if np.nonzero(sub_mat)[0].size > (m - n): @@ -202,7 +204,7 @@ def get_mask_1d(mat, n, m): # [0, 1, 0, 1]]) sparsity.check_mask_1d(mask, 2, 4) # True """ - mat_flattern, shape = reshape_1d(mat, m) + mat_flattern, shape = _reshape_1d(mat, m) mask_flattern = np.ones_like(mat_flattern) mask = np.ones_like(mat) @@ -215,9 +217,9 @@ def get_mask_1d(mat, n, m): return mask -def reshape_2d(mat, m): +def _reshape_2d(mat, m): r""" - Reshape the input matrix to shape (-1, :math:`m \times m`). + Reshape the input 2D matrix to shape (-1, :math:`m \times m`). In each dimension of :attr:`mat`, if it is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. @@ -227,11 +229,13 @@ def reshape_2d(mat, m): remainder_1 = mat.shape[1] % m Args: - mat (nparray): The input matrix. + mat (nparray): The input 2D matrix. m (int): The square root of second dimension of reshaped matrix. Returns: tuple: A pair of the reshaped and padded matrix and the shape of padded matrix (non-reshaping). """ + assert len(mat.shape) == 2, "The input mat should be a 2D matrix!" + remainder_0 = mat.shape[0] % m remainder_1 = mat.shape[1] % m @@ -297,7 +301,7 @@ def check_mask_2d(mat, n, m): [1, 1, 0, 1]]) sparsity.check_mask_2d(x, 2, 4) # True """ - mat_padded, shape = reshape_2d(mat, m) + mat_padded, shape = _reshape_2d(mat, m) for sub_mat in mat_padded: sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ @@ -338,7 +342,7 @@ def get_mask_2d_greedy(mat, n, m): # [0. 1. 1. 0.]]) sparsity.check_mask_2d(mask, 2, 4) # True """ - mat_padded, shape = reshape_2d(mat, m) + mat_padded, shape = _reshape_2d(mat, m) mask_padded = np.zeros_like(mat_padded).reshape(-1, m, m) for idx in range(len(mat_padded)): @@ -372,11 +376,11 @@ def get_mask_2d_greedy(mat, n, m): return mask[:mat.shape[0], :mat.shape[1]] -valid_2d_patterns_lock = threading.Lock() -valid_2d_patterns = {} +_valid_2d_patterns_lock = threading.Lock() +_valid_2d_patterns = {} -def compute_valid_2d_patterns(n, m): +def _compute_valid_2d_patterns(n, m): r""" Compute all vaild 2D `n:m` sparse patterns. @@ -389,12 +393,12 @@ def compute_valid_2d_patterns(n, m): Returns: dictionary: A dictionary with key: *m_n* (string) and value: all vaild 2D `n:m` sparse patterns. 
""" - global valid_2d_patterns_lock - global valid_2d_patterns + global _valid_2d_patterns_lock + global _valid_2d_patterns valid_key = '{}_{}'.format(m, n) - if valid_key in valid_2d_patterns: - return valid_2d_patterns[valid_key] + if valid_key in _valid_2d_patterns: + return _valid_2d_patterns[valid_key] else: patterns = np.zeros(m) patterns[:n] = 1 @@ -407,9 +411,9 @@ def compute_valid_2d_patterns(n, m): valid_patterns = np.empty((valid.shape[0], m, m)) valid_patterns[:] = patterns[valid[:]] - valid_2d_patterns_lock.acquire() - valid_2d_patterns[valid_key] = valid_patterns - valid_2d_patterns_lock.release() + _valid_2d_patterns_lock.acquire() + _valid_2d_patterns[valid_key] = valid_patterns + _valid_2d_patterns_lock.release() return valid_patterns @@ -446,9 +450,9 @@ def get_mask_2d_best(mat, n, m): print("L1 norm of `greedy` sparse matrix", np.multiply(mat, mask_greedy).sum()) # 56 print("L1 norm of `best` sparse matrix", np.multiply(mat, mask_best).sum()) # 61 """ - patterns = compute_valid_2d_patterns(n, m) + patterns = _compute_valid_2d_patterns(n, m) - mat_flattern, shape = reshape_2d(mat, m) + mat_flattern, shape = _reshape_2d(mat, m) mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) pmax = np.argmax( np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T), @@ -504,30 +508,25 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): dtype = tensor.dtype t = tensor.astype(float) - assert type(func_name) == MaskAlgo, \ + assert isinstance(func_name, MaskAlgo), \ "func_name argumet of create_mask is only accepted as type MaskAlgo. " \ "But got {}".format(type(func_name)) func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) elif len(shape) == 2: t = t.reshape(shape[0], shape[1]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op elif len(shape) == 4: t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) - mask = func(t, n=n, m=m) - return mask.reshape(shape).astype(dtype) else: - assert True, "The dimension of input tensor is not supported in create_mask, " \ - "Only dimension < 4 is supported but got {}".format(len(shape)) + raise ValueError("The dimension of input tensor is not supported in create_mask, " \ + "Only dimension < 4 is supported but got {}".format(len(shape))) + + mask = func(t, n=n, m=m) + return mask.reshape(shape).astype(dtype) def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): @@ -569,19 +568,15 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) - return func(t, n=n, m=m) elif len(shape) == 2: t = t.reshape(shape[0], shape[1]) - return func(t, n=n, m=m) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - return func(t, n=n, m=m) # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op elif len(shape) == 4: t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) - return func(t, n=n, m=m) else: - assert True, "The dimension of input tensor is not supported in check_sparsity, " \ - "Only dimension < 4 is supported but got {}".format(len(shape)) + raise ValueError("The dimension of input tensor is not supported 
in create_mask, " \ + "Only dimension < 4 is supported but got {}".format(len(shape))) - return False + return func(t, n=n, m=m) diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index 850b267411ed5d..f43b45553f5f00 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -73,7 +73,7 @@ def layer_warp(block_func, input, ch_in, ch_out, count, stride): return pool -def train(use_pure_fp16=True, use_nesterov=False, use_adam=False): +def train(use_pure_fp16=True, use_nesterov=False, optimizer=""): classdim = 10 data_shape = [3, 32, 32] BATCH_SIZE = 32 @@ -96,12 +96,17 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False): # Test program test_program = train_program.clone(for_test=True) - if use_adam: + if optimizer == "Adam": optimizer = paddle.optimizer.AdamW( learning_rate=0.001, epsilon=1e-8, weight_decay=0.0, multi_precision=True) + elif optimizer == "Lars": + optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer( + learning_rate=0.001, + momentum=0.9, + multi_precision=use_pure_fp16) else: optimizer = paddle.optimizer.Momentum( learning_rate=0.001, @@ -169,9 +174,11 @@ def test_resnet_pure_fp16(self): if not fluid.core.is_compiled_with_cuda(): return - def do_test(use_nesterov=False, use_adam=False): - if use_adam: + def do_test(use_nesterov=False, optimizer=""): + if optimizer == "Adam": suffix = "use Adam" + elif optimizer == "Lars": + suffix = "use Lars" else: suffix = "with Nesterov" if use_nesterov else "without Nesterov" with self.scope_prog_guard(): @@ -180,14 +187,14 @@ def do_test(use_nesterov=False, use_adam=False): train_loss_fp16, test_loss_fp16 = train( use_pure_fp16=True, use_nesterov=use_nesterov, - use_adam=use_adam) + optimizer=optimizer) with self.scope_prog_guard(): print("-----------------FP32 Train {}-----------------".format( suffix)) train_loss_fp32, test_loss_fp32 = train( use_pure_fp16=False, use_nesterov=use_nesterov, - use_adam=use_adam) + optimizer=optimizer) self.assertTrue( np.allclose( @@ -208,7 +215,8 @@ def do_test(use_nesterov=False, use_adam=False): do_test(use_nesterov=False) do_test(use_nesterov=True) - do_test(use_adam=True) + do_test(optimizer="Adam") + do_test(optimizer="Lars") @contextlib.contextmanager def scope_prog_guard(self): diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 9e931ad40c57a5..7886b6b3f7ad7c 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -269,14 +269,6 @@ def to_list(s): from .core_avx import _dygraph_debug_level from .core_avx import _switch_tracer from .core_avx import _set_paddle_lib_path - from .core_avx import _save_static_dict - from .core_avx import _load_static_dict - from .core_avx import _save_dygraph_dict - from .core_avx import _load_dygraph_dict - from .core_avx import _save_lod_tensor - from .core_avx import _load_lod_tensor - from .core_avx import _save_selected_rows - from .core_avx import _load_selected_rows from .core_avx import _create_loaded_parameter from .core_avx import _cuda_synchronize from .core_avx import _promote_types_if_complex_exists @@ -328,14 +320,6 @@ def to_list(s): from .core_noavx import _dygraph_debug_level from .core_noavx import _switch_tracer from .core_noavx import _set_paddle_lib_path - from .core_noavx import _save_static_dict - from .core_noavx import _load_static_dict - from .core_noavx import 
_save_dygraph_dict - from .core_noavx import _load_dygraph_dict - from .core_noavx import _save_lod_tensor - from .core_noavx import _load_lod_tensor - from .core_noavx import _save_selected_rows - from .core_noavx import _load_selected_rows from .core_noavx import _create_loaded_parameter from .core_noavx import _cuda_synchronize from .core_noavx import _promote_types_if_complex_exists diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index 8e90b308b393ed..eaaf4cc2d9f62b 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -78,7 +78,6 @@ def default_collate_fn(batch): raise TypeError("batch data con only contains: tensor, numpy.ndarray, " "dict, list, number, but got {}".format(type(sample))) - return outputs def default_convert_fn(batch): diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 26bd1f06e12e84..409f55efebc8a6 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -168,6 +168,89 @@ def reraise(self): raise self.exc_type(msg) +# The function `_generate_states` is adapted from `numpy.random.SeedSequence` +# from https://github.com/numpy/numpy/blob/main/numpy/random/bit_generator.pyx +# Here is the copyright: + +# SeedSequence is derived from Melissa E. O'Neill's C++11 `std::seed_seq` +# implementation, as it has a lot of nice properties that we want. +# https://gist.github.com/imneme/540829265469e673d045 +# http://www.pcg-random.org/posts/developing-a-seed_seq-alternative.html + +# The MIT License (MIT) + +# Copyright (c) 2015 Melissa E. O'Neill +# Copyright (c) 2019 NumPy Developers +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
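+# The constants below are the 32-bit hash parameters taken from the
+# SeedSequence reference implementation; `_generate_states` mixes `base_seed`
+# and `worker_id` through them so that every DataLoader worker seeds numpy
+# differently (see the `np.random.seed` call in `_worker_loop` below).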
+
+INIT_A = 0x43b0d7e5
+MULT_A = 0x931e8875
+INIT_B = 0x8b51f9dd
+MULT_B = 0x58f38ded
+MIX_MULT_L = 0xca01f9dd
+MIX_MULT_R = 0x4973f715
+XSHIFT = np.dtype(np.uint32).itemsize * 8 // 2
+MASK32 = 0xFFFFFFFF
+
+
+def _generate_states(base_seed=0, worker_id=0):
+    # initialize hash constants
+    hash_const_A = INIT_A
+    hash_const_B = INIT_B
+
+    def hash(value):
+        nonlocal hash_const_A
+        value = (value ^ hash_const_A) & MASK32
+        hash_const_A = (hash_const_A * MULT_A) & MASK32
+        value = (value * hash_const_A) & MASK32
+        value = (value ^ (value >> XSHIFT)) & MASK32
+        return value
+
+    def mix(x, y):
+        result_x = (MIX_MULT_L * x) & MASK32
+        result_y = (MIX_MULT_R * y) & MASK32
+        result = (result_x - result_y) & MASK32
+        result = (result ^ (result >> XSHIFT)) & MASK32
+        return result
+
+    # initialize the entropy list with base_seed and worker_id, then compute the pool
+    entropys = [worker_id, base_seed & MASK32, base_seed >> 32, 0]
+    pool = [hash(entropy) for entropy in entropys]
+
+    # mix all bits together
+    for i in range(len(pool)):
+        for j in range(len(pool)):
+            if i != j:
+                pool[j] = mix(pool[j], hash(pool[i]))
+
+    states = []
+    for p in pool:
+        state = (p ^ hash_const_B) & MASK32
+        hash_const_B = (hash_const_B * MULT_B) & MASK32
+        state = (state * hash_const_B) & MASK32
+        state = (state ^ (state >> XSHIFT)) & MASK32
+        states.append(state)
+
+    return states
+
+
 def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
                  auto_collate_batch, collate_fn, init_fn, worker_id,
                  num_workers, use_shared_memory):
@@ -181,6 +264,15 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event,
         # set signal handler
         core._set_process_signal_handler()
 
+        # set a different numpy seed for each worker
+        try:
+            import numpy as np
+            import time
+        except ImportError:
+            pass
+        else:
+            np.random.seed(_generate_states(int(time.time()), worker_id))
+
         global _worker_info
         _worker_info = WorkerInfo(
             id=worker_id, num_workers=num_workers, dataset=dataset)
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index fa168a62de11a9..29eee429ef66ab 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -25,6 +25,7 @@
 from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakTransformOptimizer
 from paddle.fluid.dygraph.dygraph_to_static.call_transformer import CallTransformer
 from paddle.fluid.dygraph.dygraph_to_static.cast_transformer import CastTransformer
+from paddle.fluid.dygraph.dygraph_to_static.grad_transformer import GradTransformer
 from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import IfElseTransformer
 from paddle.fluid.dygraph.dygraph_to_static.list_transformer import ListTransformer
 from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTransformer
@@ -86,6 +87,7 @@ def transfer_from_node_type(self, node_wrapper):
             PrintTransformer,  # print statement
             CallTransformer,  # transform call recursively
             CastTransformer,  # type casting statement
+            GradTransformer,  # transform paddle.grad to paddle.gradients
         ]
 
         for index, transformer in enumerate(transformers):
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py
new file mode 100644
index 00000000000000..f7a59063ae653f
--- /dev/null
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py
@@ -0,0 +1,87 @@
+#
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import gast
+import warnings
+
+from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper
+from paddle.fluid.dygraph.dygraph_to_static import utils
+
+
+class GradTransformer(gast.NodeTransformer):
+    """
+    A class that transforms dygraph paddle.grad calls to static graph paddle.gradients. The
+    transformation is applied to support double grad mode.
+    """
+
+    def __init__(self, wrapper_root):
+        assert isinstance(
+            wrapper_root, AstNodeWrapper
+        ), "Input non-AstNodeWrapper node for the initialization of GradTransformer."
+        self.wrapper_root = wrapper_root
+        self.root = wrapper_root.node
+
+    def transform(self):
+        self.visit(self.root)
+
+    def visit_Call(self, node):
+        self.generic_visit(node)
+        if not is_grad_api_node(node):
+            return node
+
+        dygraph_grad_parameters = [
+            "outputs", "inputs", "grad_outputs", "retain_graph", "create_graph",
+            "only_inputs", "allow_unused", "no_grad_vars"
+        ]
+        to_static_grad_param = {
+            "outputs": "targets",
+            "inputs": "inputs",
+            "grad_outputs": "target_gradients",
+            "no_grad_vars": "no_grad_set"
+        }
+        static_keywords = []
+
+        for kw in node.keywords:
+            if kw.arg not in dygraph_grad_parameters or kw.arg not in to_static_grad_param:
+                warnings.warn("paddle.grad has unsupported parameter in jit: " +
+                              kw.arg + ", jit will discard it")
+                continue
+            dygraph_grad_parameters.remove(kw.arg)
+            kw.arg = to_static_grad_param[kw.arg]
+            static_keywords.append(kw)
+
+        for i in range(len(node.args)):
+            arg_name = dygraph_grad_parameters[i]
+            if arg_name not in to_static_grad_param:
+                warnings.warn("paddle.grad has unsupported parameter in jit: " +
+                              arg_name + ", jit will discard it")
+                continue
+            kw = gast.keyword(
+                arg=to_static_grad_param[arg_name], value=node.args[i])
+            static_keywords.append(kw)
+
+        node.func = gast.parse('paddle.static.gradients').body[0].value
+        node.keywords = static_keywords
+        node.args = []
+        return node
+
+
+def is_grad_api_node(node):
+    assert isinstance(node, gast.Call)
+    api_name = utils.ast_to_source_code(node.func).strip()
+    if utils.is_paddle_api(node):
+        return api_name.endswith("grad")
+    return False
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py
index de788487feabc7..5bc1c3d96d9c95 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py
@@ -402,7 +402,7 @@ def _modified_vars(child_dict, parent_dict):
             var for var in _vars_with_store(child_dict) if var in parent_dict
         ])
 
-    def _vars_loaded_before_store(ids_dict):
+    def _vars_loaded(ids_dict):
         """
         gast.Param is also a kind of `load` semantic.
""" @@ -411,8 +411,6 @@ def _vars_loaded_before_store(ids_dict): for ctx in ctxs: if isinstance(ctx, (gast.Load, gast.Param)): new_dict[k].append(ctx) - elif isinstance(ctx, gast.Store): - break return new_dict # modified vars @@ -439,8 +437,12 @@ def _vars_loaded_before_store(ids_dict): new_vars_in_body_and_orelse = body_new_vars & orelse_new_vars # 3. new var is created only in one of If.body or If.orelse node, and it used as gast.Load firstly after gast.If node. + # TODO(zhhsplendid): the _vars_loaded can be optimized as _vars_loaded_before_store. Because if a variable is stored before load, + # the value would change by the store statement, we don't have to return to change the value. However, analysis is + # complex because if the IfElse is nested and outer IfElse store statement may not run at all. We will put this optimization + # as the future TODO used_vars_after_ifelse = set( - [var for var in _vars_loaded_before_store(after_ifelse_vars_dict)]) + [var for var in _vars_loaded(after_ifelse_vars_dict)]) new_vars_to_create = new_vars_in_one_of_body_or_orelse & used_vars_after_ifelse | new_vars_in_body_and_orelse # 4. generate return_ids of if/else node. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index feb8b0f9c9a16e..7910e7a38558ce 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -16,6 +16,7 @@ import numpy as np import six +import paddle from paddle.fluid import framework, backward, core from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph @@ -135,6 +136,7 @@ def __init__(self, main_program, inputs, outputs, parameters=None): self._origin_main_program = self._verify_program(main_program) self._inner_scope = core.Scope() # Set default mode to train + self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True @LazyInitialized @@ -192,30 +194,42 @@ def _prune_unused_params(self, program): """ required_params = [] for param in self._params: + found_param = False for block in program.blocks: - if param.name in block.vars: - required_params.append(param) + for op in block.ops: + if param.name in op.input_arg_names or param.name in op.output_arg_names: + required_params.append(param) + found_param = True + break + if found_param: break self._params = required_params + def _get_double_grads(self, program): + double_grads = [] + for block in program.blocks: + for name in block.vars: + if "@GRAD" in name: + var_desc = block.vars[name].desc + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) + double_grads.append(var_base) + return double_grads + def forward(self, inputs): in_vars, out_vars, tmp_scope_vec = self._prepare(inputs) - framework._dygraph_tracer().trace_op( - type='run_program', - inputs={ - 'X': valid_vars(in_vars), - 'Params': valid_vars(self._params) - }, - outputs={'Out': valid_vars(out_vars), - 'OutScope': tmp_scope_vec}, - attrs={ - 'global_block': self.program.desc.block(0), - 'start_op_index': 0, - 'end_op_index': self._infer_program.desc.block(0).op_size(), - 'is_test': not self.training - }) + attrs = ('global_block', self.program.desc.block(0), 'start_op_index', + 0, 'end_op_index', self._infer_program.desc.block(0).op_size(), + 'is_test', not self.training) + core.ops.run_program( + valid_vars(in_vars), + valid_vars(self._params), + 
+            valid_vars(out_vars), tmp_scope_vec,
+            valid_vars(self._double_grads), *attrs)
 
         restored_nest_out = self._restore_out(out_vars)
         return self._remove_no_value(restored_nest_out)
@@ -242,8 +256,19 @@ def _prepare(self, inputs):
                     place=framework._current_expected_place(),
                     zero_copy=True)
             elif isinstance(value, core.VarBase):
-                var = value
-                var.name = self._inputs[i].desc.name()
+                value.name = self._inputs[i].desc.name()
+                if value.stop_gradient:
+                    # NOTE(Aurelius84): If the var is on CPUPlace, it would be transferred into
+                    # CUDAPlace multiple times when used as the input of multiple Ops. So we
+                    # move it in advance to avoid this problem.
+                    var = paddle.to_tensor(
+                        value,
+                        dtype=value.dtype,
+                        place=framework._current_expected_place(),
+                        stop_gradient=True)
+                    var.name = value.name
+                else:
+                    var = value
             else:
                 continue
             input_vars.append(var)
diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py
index 33eb16f1b2b44c..d5ad3a88e8c241 100644
--- a/python/paddle/fluid/dygraph/io.py
+++ b/python/paddle/fluid/dygraph/io.py
@@ -166,29 +166,46 @@ def _get_loaded_var_new_old(program_desc, all_new_old_dict_all):
 
 def _rename_var_program_desc(program_desc, include=None, exclude=None):
     """
-    Change the name of the loaded variables.Use 'unique_name.generate' to avoid duplication
-    e.g. linear_0.tmp_3 ==> linear_0.tmp_1, x ==> x_0. 
-    If 'include' is not `None`,variables that are not in include are not renamed.
-    If 'exclude' is not `None`,variables that are in exclude will are not renamed.
+    Change the names of the loaded variables. Use 'unique_name.generate' to avoid duplication.
+    It is used when loading multiple programs during inference.
+
+    e.g. linear_0.tmp_3 ==> linear_0.tmp_1, x ==> x_0. For double grad, x@GRAD ==> x_0@GRAD
+    If 'include' is not `None`, variables in include and the corresponding
+    double grad variables (if they exist) are renamed.
+    If 'exclude' is not `None`, variables that are in exclude and the
+    corresponding double grad variables (if they exist) are not renamed.
 
     Args:
         program_desc(ProgramDesc):the variables in it will be modified.
         include(List):list of names of variables.
        exclude(List):list of names of variables.
+ + Returns: + tuple of (dict_rename_var_new_old, dict_rename_var_old_new) + dict_rename_var_new_old is a dict mapping from new name to old name + dict_rename_var_old_new is a dict mapping from old name to new name """ dict_rename_var_old_new = dict() dict_rename_var_new_old = dict() old_names = [] + # Store all old names for b_idx in six.moves.range(program_desc.num_blocks()): cur_block = program_desc.block(b_idx) for var in cur_block.all_vars(): old_names.append(var.name()) + + # Create dict_rename_var_new_old and dict_rename_var_old_new for non double + # grad variables + has_double_grad = False for b_idx in six.moves.range(program_desc.num_blocks()): cur_block = program_desc.block(b_idx) for var_idx, var in enumerate(cur_block.all_vars()): name_old = var.name() + is_double_grad_var = "@GRAD" in name_old + has_double_grad = has_double_grad or is_double_grad_var should_rename = (include is None or name_old in include) and ( - exclude is None or name_old not in exclude) + exclude is None or + name_old not in exclude) and not is_double_grad_var if should_rename: temp_name = name_old.split('_') if len(temp_name) > 1 and temp_name[-1].isnumeric(): @@ -206,9 +223,29 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): if name_old != name_new: cur_block._rename_var( cpt.to_bytes(name_old), cpt.to_bytes(name_new)) - dict_rename_var_old_new[name_old] = name_new - dict_rename_var_new_old[name_new] = name_old - + if not is_double_grad_var: + dict_rename_var_old_new[name_old] = name_new + dict_rename_var_new_old[name_new] = name_old + + # Handle double grad names + if has_double_grad: + double_grad_rename_dict = {} + for name_old in dict_rename_var_old_new: + for b_idx in six.moves.range(program_desc.num_blocks()): + cur_block = program_desc.block(b_idx) + for var_idx, var in enumerate(cur_block.all_vars()): + var_name = var.name() + if "@GRAD" in var_name and name_old in var_name: + new_var_name = var_name.replace( + name_old, dict_rename_var_old_new[name_old]) + double_grad_rename_dict[var_name] = new_var_name + for var_name in double_grad_rename_dict: + dict_rename_var_old_new[var_name] = double_grad_rename_dict[ + var_name] + dict_rename_var_new_old[double_grad_rename_dict[ + var_name]] = var_name + + # Rename on program desc for b_idx in six.moves.range(program_desc.num_blocks()): cur_block = program_desc.block(b_idx) for op_idx in six.moves.range(cur_block.op_size()): @@ -220,6 +257,11 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): op._rename_input( input_arg_name, dict_rename_var_old_new[input_arg_name]) + if cur_block.has_var(cpt.to_bytes(input_arg_name)): + cur_block._rename_var( + cpt.to_bytes(input_arg_name), + cpt.to_bytes(dict_rename_var_old_new[ + input_arg_name])) for output_arg_name in op.output_arg_names(): if output_arg_name in dict_rename_var_old_new: if output_arg_name != dict_rename_var_old_new[ @@ -227,6 +269,11 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): op._rename_output( output_arg_name, dict_rename_var_old_new[output_arg_name]) + if cur_block.has_var(cpt.to_bytes(output_arg_name)): + cur_block._rename_var( + cpt.to_bytes(output_arg_name), + cpt.to_bytes(dict_rename_var_old_new[ + output_arg_name])) program_desc.flush() return dict_rename_var_new_old, dict_rename_var_old_new @@ -267,9 +314,10 @@ class _ProgramHolder(object): def __init__(self, program_desc): super(_ProgramHolder, self).__init__() - # input, output, persistable var info + # input, output, persistable, double_grads var info 
         self._input_descs = []
         self._output_descs = []
+        self._double_grad_descs = []
         self._persistable_names = []
 
         # execution scope
@@ -277,7 +325,6 @@ def __init__(self, program_desc):
 
         # append suffix var name dict
         self._suffix_varname_dict = None
-
         # forward program
         self._infer_program_desc = self._preprocess(program_desc)
         # forward + backward program
@@ -304,6 +351,10 @@ def output_descs(self):
     def persistable_names(self):
         return self._persistable_names
 
+    @property
+    def double_grad_descs(self):
+        return self._double_grad_descs
+
     @property
     def scope(self):
         return self._inner_scope
@@ -347,6 +398,12 @@ def _preprocess(self, program_desc):
         for op_idx in reversed(ops_to_remove):
             root_block._remove_op(op_idx, op_idx + 1)
 
+        for i in range(program_desc.num_blocks()):
+            block_desc = program_desc.block(i)
+            for var_desc in block_desc.all_vars():
+                if "@GRAD" in var_desc.name():
+                    self._double_grad_descs.append(var_desc)
+
         # 2. Input processing, reverse feed vars
         self._input_descs.reverse()
@@ -412,7 +469,6 @@ def _append_backward_desc(self, infer_program_desc):
         # rewrite a series of methods for append_backward for program_desc.
         # Therefore, in order to reuse the method of backward.py, build the program here.
         program = _build_program_by_desc(program_desc_copy)
-
         # 3. Add the outputs which is only used for training and not saved in
         # inference program.
         for block_idx in six.moves.range(program.num_blocks):
@@ -738,6 +794,20 @@ def _run_dygraph(instance, input, program_holder):
                                  core.VarDesc.VarType.STEP_SCOPES, True)
     tmp_scope_vec.value().set_scope(program_holder.scope)
 
+    double_grad_vars = []
+    for var_desc in program_holder.double_grad_descs:
+        var = core.VarBase(var_desc.dtype(),
+                           var_desc.shape(),
+                           var_desc.name(), var_desc.type(), False)
+        double_grad_vars.append(var)
+    if len(double_grad_vars) == 0:
+        double_grad_vars = [
+            core.VarBase(
+                value=[1],
+                name='Fake_var',
+                place=framework._current_expected_place())
+        ]
+
     # 2. run program by op
     trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program
     end_op_index = program_holder.infer_program.block(0).op_size()
@@ -745,8 +815,11 @@ def _run_dygraph(instance, input, program_holder):
         type='run_program',
         inputs={'X': input_vars,
                 'Params': persistable_vars},
-        outputs={'Out': output_vars,
-                 'OutScope': tmp_scope_vec},
+        outputs={
+            'Out': output_vars,
+            'OutScope': tmp_scope_vec,
+            'DOut': double_grad_vars
+        },
         attrs={
             'global_block': trace_program.block(0),
             'start_op_index': 0,
diff --git a/python/paddle/fluid/dygraph/layer_hooks.py b/python/paddle/fluid/dygraph/layer_hooks.py
new file mode 100644
index 00000000000000..e9c6867cb7c8ba
--- /dev/null
+++ b/python/paddle/fluid/dygraph/layer_hooks.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+import warnings
+
+from paddle.fluid.framework import default_main_program, in_dygraph_mode
+
+
+class LayerOpsRecorder:
+    """
+    Record information about the operators generated in an nn.Layer.
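+
+    Fields (as used by `record_program_ops_pre_hook` and
+    `set_op_customized_attrs_post_hook` below):
+        start/end (int): op indices bounding this layer's forward pass in the current block.
+        ops (list): the recorded operators.
+        is_valid (bool): whether the recorded range can be trusted.
+        hooks (list): the forward hooks to remove once the attributes have been set.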
+ """ + + def __init__(self, start=-1, end=-1, ops=None, is_valid=False, hooks=None): + self.start = start + self.end = end + self.ops = ops + self.is_valid = is_valid + self.hooks = hooks + + +def record_program_ops_pre_hook(layer, inputs): + """ + A pre-hook to mark op numbers before enter layer.forward. + """ + if not in_dygraph_mode(): + if layer._op_recorder.start < 0: + layer._op_recorder.start = len(default_main_program().current_block( + ).ops) + layer._op_recorder.is_valid = True + else: + layer._op_recorder.is_valid = False + warnings.warn( + "{} has recorded the op information before. Please check whether you call this layer twice.". + format(layer._full_name)) + + return None + + +def set_op_customized_attrs_post_hook(layer, inputs, outputs): + """ + A post-hook to append customized attributes into all operators generated in current layer. + """ + if not in_dygraph_mode() and layer._op_recorder.is_valid: + + start = layer._op_recorder.start + end = len(default_main_program().current_block().ops) + assert (start >= 0 and end >= start) + ops = default_main_program().current_block().ops[start:end] + + layer._op_recorder.end = end + layer._op_recorder.ops = ops + + for op in ops: + for attr_name, val in six.iteritems(layer._customized_attrs): + op._set_attr(attr_name, val) + + # remove pre-hook and post-hook + for hook_helper in layer._op_recorder.hooks: + hook_helper.remove() + + return None diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py index a904f80639752a..5bf5eda19a5d0c 100644 --- a/python/paddle/fluid/dygraph/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -16,7 +16,7 @@ import copy import six -from ..framework import Parameter, in_dygraph_mode +from ..framework import Parameter, in_dygraph_mode, _global_flags from ..param_attr import ParamAttr from .. import core from six.moves import zip @@ -158,7 +158,7 @@ def append_activation(self, input_var, act=None, use_cudnn=None): if (use_cudnn is not None) and use_cudnn: act['use_cudnn'] = use_cudnn - use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] if (use_mkldnn is not None) and use_mkldnn: act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index ecf6be1a0224af..cb7666b353db79 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -30,6 +30,7 @@ from .. import unique_name from paddle.fluid import core from .layer_object_helper import LayerObjectHelper +from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder from .base import program_desc_tracing_guard, param_guard from paddle.fluid import framework from ..param_attr import ParamAttr @@ -113,6 +114,10 @@ def __init__(self, name_scope=None, dtype="float32"): self._sub_layers = collections.OrderedDict() self._loaddict_holder = collections.OrderedDict() + # Record generated op_descs in this layer + self._op_recorder = LayerOpsRecoder(ops=[], hooks=[]) + self._customized_attrs = {} + self._forward_pre_hooks = collections.OrderedDict() self._forward_post_hooks = collections.OrderedDict() @@ -665,7 +670,7 @@ def named_sublayers(self, prefix='', include_self=False, layers_set=None): Parameters: prefix(str, optional): Prefix to prepend to all parameter names. Default: ''. include_self(bool, optional): Whether include the Layer itself. Default: False. 
-            layers_set(set, optioanl): The set to record duplicate sublayers. Default: None.
+            layers_set(set, optional): The set to record duplicate sublayers. Default: None.
 
         Yields:
             (string, Layer): Tuple of name and Layer
@@ -1028,6 +1033,54 @@ def forward(self, input):
             self._parameters[name] = parameter
         return parameter
 
+    def _set_op_attrs(self, attrs):
+        """
+        Add customized attributes to the ops appended while this layer runs. In the case of
+        quantization, we want to save some attributes into op_desc while exporting an
+        inference model via @to_static.
+
+        Arguments:
+            attrs(dict): customized attributes that will be added into op_descs.
+
+        NOTE: The interface is only exposed to developers.
+        """
+
+        def is_already_registered(is_pre_hook):
+            layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks
+            candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook
+
+            already_registered = False
+            if layers_hooks:
+                last_key = next(reversed(layers_hooks))
+                already_registered = (layers_hooks[last_key] == candidate_hook)
+
+            return already_registered
+
+        if not isinstance(attrs, dict):
+            raise TypeError("attrs should be type(dict), but received {}".
+                            format(type(attrs).__name__))
+
+        # NOTE: Overwrite behavior for same key.
+        self._customized_attrs.update(attrs)
+
+        if not is_already_registered(is_pre_hook=True):
+            pre_hook_helper = self.register_forward_pre_hook(
+                record_program_ops_pre_hook)
+            assert len(self._op_recorder.hooks) == 0
+            self._op_recorder.hooks = [pre_hook_helper]
+
+        # manually register the post-hook to ensure it is inserted at the head.
+        if not is_already_registered(is_pre_hook=False):
+            post_hook_helper = self.register_forward_post_hook(
+                set_op_customized_attrs_post_hook)
+            if len(self._forward_post_hooks) > 1:
+                self._forward_post_hooks.move_to_end(
+                    post_hook_helper._hook_id, last=False)
+
+            assert len(self._op_recorder.hooks) == 1
+
+            # hooks that need to be removed once we finish executing them.
+            self._op_recorder.hooks.append(post_hook_helper)
+
     def __getstate__(self):
         return self.__dict__
diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index e39fc3e23fe564..a014e0a722ab32 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -46,9 +46,7 @@
     '__rsub__',
     '__mul__',
     '__rmul__',
-    '__div__',
     '__truediv__',
-    '__rdiv__',
     '__rtruediv__',
     '__matmul__',
 ]
@@ -168,9 +166,6 @@ def _scalar_rsub_(var, value):
     def _scalar_mul_(var, value):
         return _scalar_elementwise_op_(var, value, 0.0)
 
-    def _scalar_div_(var, value):
-        return _scalar_elementwise_op_(var, 1.0 / value, 0.0)
-
     # for binary operator such as elementwise, compare
     def _binary_creator_(method_name,
                          op_type,
@@ -201,7 +196,10 @@ def __impl__(self, other_var):
             if op_type == 'elementwise_div' and self.dtype in _supported_int_dtype_:
                 self = astype(self, 'float32')
             # here use `scale` replace `elementwise` to get better performance
-            # but only +, -, *, / can use this method
+            # but only +, -, * can use this method
+            # NOTE(chentianyu03): / cannot use the `scale` method, because the result of
+            # the `scale` method (self*(1/other_var)) does not exactly equal the result
+            # of the `elementwise_div` method.
             if scalar_method is not None:
                 return scalar_method(self, other_var)
             else:
@@ -288,12 +286,8 @@ def __impl__(self, other_var):
         ## a*b == b*a.
Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), - ('__div__', _binary_creator_('__div__', 'elementwise_div', False, - _scalar_div_)), ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', - False, _scalar_div_)), - ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, - None)), + False, None)), ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index ce728f1121dfdb..9d6e637342a7b6 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -21,7 +21,7 @@ from ..layers import nn as F from .. import dygraph_utils from . import layers -from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program +from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter, _dygraph_tracer, _varbase_creator, default_main_program, _global_flags from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer @@ -188,7 +188,7 @@ def __init__(self, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") self._use_cudnn = use_cudnn - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] self._filter_size = filter_size self._num_filters = num_filters self._param_attr = param_attr @@ -837,7 +837,7 @@ def __init__(self, if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] if data_format not in ["NCHW", "NHWC"]: raise ValueError( @@ -966,7 +966,7 @@ def __init__(self, self.bias = self.create_parameter( shape=[output_dim], attr=bias_attr, dtype=dtype, is_bias=True) - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] def forward(self, input): if in_dygraph_mode(): @@ -1268,7 +1268,7 @@ def __init__(self, self._param_attr = param_attr self._bias_attr = bias_attr self._act = act - self._use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + self._use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] assert bias_attr is not False, "bias_attr should not be False in batch_norm." 
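The `core.globals()` call sites replaced above all go through the new `_global_flags()` accessor introduced in framework.py below, which caches the `core.globals()` object once at import time. A minimal sketch of the pattern (illustrative only; it assumes a Paddle build with this patch applied):

    from paddle.fluid.framework import _global_flags, get_flags, set_flags

    # Reads go through the module-level cached object instead of calling
    # core.globals() on every layer invocation.
    use_mkldnn = _global_flags()["FLAGS_use_mkldnn"]

    # The public helpers patched below keep working unchanged on top of it.
    set_flags({'FLAGS_eager_delete_tensor_gb': 1.0})
    print(get_flags('FLAGS_eager_delete_tensor_gb'))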
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index bffeaf2c6c973e..695c91fea819f5 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -72,6 +72,7 @@
 _global_expected_place_ = None
 _current_device = None
 global_prog_seed = 0
+_global_flags_ = core.globals()
 
 
 def require_version(min_version, max_version=None):
@@ -286,6 +287,10 @@ def _dygraph_tracer():
     return _dygraph_tracer_
 
 
+def _global_flags():
+    return _global_flags_
+
+
 def _current_expected_place():
     global _global_expected_place_
     if _global_expected_place_ is None:
@@ -2142,7 +2147,7 @@ def _to_readable_code(self, skip_op_callstack=True):
         """
         assert isinstance(
             skip_op_callstack, bool
-        ), "skip_op_callstack parameter's type is error, expect bool, received %s".format(
+        ), "skip_op_callstack parameter's type is wrong, expected bool, received {}".format(
             type(skip_op_callstack))
         outputs_str = "{"
         for i in range(0, len(self.output_names)):
@@ -2550,7 +2555,7 @@ def _to_readable_code(self, skip_op_callstack=True):
         """
         assert isinstance(
             skip_op_callstack, bool
-        ), "skip_op_callstack parameter's type is error, expect bool, received %s".format(
+        ), "skip_op_callstack parameter's type is wrong, expected bool, received {}".format(
             type(skip_op_callstack))
         block_str = "{ // block "
         block_str += "{}\n".format(self.idx)
@@ -4259,7 +4264,7 @@ def _to_readable_code(self, skip_op_callstack=True):
         """
         assert isinstance(
             skip_op_callstack, bool
-        ), "skip_op_callstack parameter's type is error, expect bool, received %s".format(
+        ), "skip_op_callstack parameter's type is wrong, expected bool, received {}".format(
             type(skip_op_callstack))
         program_str = ""
         for block in self.blocks:
@@ -5833,8 +5838,8 @@ def set_flags(flags):
     if not isinstance(flags, dict):
         raise TypeError('flags in set_flags should be a dict')
     for key, value in flags.items():
-        if core.globals().is_public(key):
-            core.globals()[key] = value
+        if _global_flags().is_public(key):
+            _global_flags()[key] = value
         else:
             raise ValueError(
                 "Flag %s cannot set its value through this function." % (key))
@@ -5863,8 +5868,8 @@ def get_flags(flags):
     flags_value = {}
     if isinstance(flags, (list, tuple)):
         for key in flags:
-            if (core.globals().is_public(key)):
-                value = core.globals()[key]
+            if (_global_flags().is_public(key)):
+                value = _global_flags()[key]
                 temp = {key: value}
                 flags_value.update(temp)
             else:
                 raise ValueError(
                     'Flag %s cannot get its value through this function.'
                    % (key))
     elif isinstance(flags, str):
-        if (core.globals().is_public(flags)):
-            value = core.globals()[flags]
+        if (_global_flags().is_public(flags)):
+            value = _global_flags()[flags]
             temp = {flags: value}
             flags_value.update(temp)
         else:
diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
index 94a371ae3fb5bb..fe09692531ad3a 100644
--- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py
+++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py
@@ -268,8 +268,7 @@ def mv(self, fs_src_path, fs_dst_path, overwrite=False, test_exists=True):
                 fs_src_path))
 
         if self.is_exist(fs_dst_path):
-            raise FSFileExistsError("{} exists already".format(
-                fs_src_path, fs_dst_path, fs_dst_path))
+            raise FSFileExistsError("{} exists already".format(fs_dst_path))
 
         return self._try_mv(fs_src_path, fs_dst_path)
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 5b2010f3409580..54ba5f22e53d6c 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -152,6 +152,7 @@ def __call__(self, var, block=None):
             out_dtype = var.dtype
             out_var = var
 
+        # fill_constant should also set "str_value" to preserve precision
         op = block.append_op(
             type="fill_constant",
             outputs={"Out": out_var},
@@ -159,6 +160,7 @@ def __call__(self, var, block=None):
                 "shape": var.shape,
                 "dtype": int(out_dtype),
                 "value": float(self._value),
+                'str_value': str(float(self._value)),
                 'force_cpu': self._force_cpu
             },
             stop_gradient=True)
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 30a0b4053e6ffa..2d3578c6c104b0 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -23,6 +23,7 @@
 import contextlib
 from functools import reduce
 import sys
+from io import BytesIO
 
 import numpy as np
 import math
@@ -71,6 +72,52 @@
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')
 
 
+class _open_buffer(object):
+    def __init__(self, buffer):
+        self.buffer = buffer
+
+    def __enter__(self):
+        return self.buffer
+
+
+class _buffer_reader(_open_buffer):
+    def __init__(self, buffer):
+        super(_buffer_reader, self).__init__(buffer)
+        self.initial_tell = self.buffer.tell()
+
+    def __exit__(self, *args):
+        # `args[0]` is the exception type. If reading fails, the file pointer is
+        # restored to its initial position.
+        if args[0] is not None:
+            self.buffer.seek(self.initial_tell)
+
+
+class _buffer_writer(_open_buffer):
+    def __exit__(self, *args):
+        self.buffer.flush()
+
+
+def _is_file_path(path):
+    return isinstance(path, str)
+
+
+def _open_file_buffer(path_or_buffer, mode):
+
+    if _is_file_path(path_or_buffer):
+        return open(path_or_buffer, mode)
+    else:
+        if 'w' in mode:
+            return _buffer_writer(path_or_buffer)
+        elif 'r' in mode:
+            return _buffer_reader(path_or_buffer)
+        else:
+            raise ValueError("Expected 'r' or 'w' in mode but got {}".format(
+                mode))
+
+
+def _is_memory_buffer(buffer):
+    return isinstance(buffer, BytesIO)
+
+
 def is_parameter(var):
     """
     Check whether the given variable is an instance of Parameter.
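With `_open_file_buffer` in place, the save path patched below can write to an in-memory `BytesIO` object as well as to a file path. A minimal round-trip sketch (illustrative only, exercising just the private helpers added in this hunk):

    from io import BytesIO
    import pickle

    buf = BytesIO()
    with _open_file_buffer(buf, 'wb') as f:   # dispatches to _buffer_writer
        pickle.dump({'w': [1.0, 2.0]}, f)     # buffer is flushed on exit

    buf.seek(0)
    with _open_file_buffer(buf, 'rb') as f:   # dispatches to _buffer_reader
        state = pickle.load(f)                # pointer rewinds only on error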
@@ -1776,14 +1823,16 @@ def get_tensor(var): param_dict = {name: get_tensor(param_dict[name]) for name in param_dict} # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if sys.platform == 'darwin' and sys.version_info.major == 3: + if _is_file_path( + model_path + ) and sys.platform == 'darwin' and sys.version_info.major == 3: pickle_bytes = pickle.dumps(param_dict, protocol=protocol) with open(model_path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: - with open(model_path, 'wb') as f: + with _open_file_buffer(model_path, 'wb') as f: pickle.dump(param_dict, f, protocol=protocol) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index db556913384785..2b677c11e9d96b 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,7 +17,7 @@ import copy import six -from .framework import Parameter, dtype_is_floating, in_dygraph_mode, OpProtoHolder +from .framework import Parameter, dtype_is_floating, in_dygraph_mode, OpProtoHolder, _global_flags from . import unique_name from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr @@ -148,7 +148,7 @@ def append_activation(self, input_var): if 'use_cudnn' in self.kwargs and self.kwargs.get('use_cudnn'): act['use_cudnn'] = self.kwargs.get('use_cudnn') use_mkldnn = self.kwargs.get( - 'use_mkldnn', core.globals().get("FLAGS_use_mkldnn", False)) + 'use_mkldnn', _global_flags().get("FLAGS_use_mkldnn", False)) if use_mkldnn: act['use_mkldnn'] = use_mkldnn act_type = act.pop('type') diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index e9738b6660eeaf..c2de5670eb42c1 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -312,6 +312,10 @@ def create_parameter(self, if not attr: return None assert isinstance(attr, ParamAttr) + for i, size in enumerate(shape): + assert size > 0, ( + "Expected every dim's size to be larger than 0, " + "but the size of the {}-th dim is {}".format(i, size)) # set global dtype if not dtype: dtype = self.__dtype diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index cf4abc207bd754..604bcc0e277769 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -1139,7 +1139,9 @@ def yolo_box(x, downsample_ratio, clip_bbox=True, name=None, - scale_x_y=1.): + scale_x_y=1., + iou_aware=False, + iou_aware_factor=0.5): """ ${comment} @@ -1156,6 +1158,8 @@ def yolo_box(x, name (string): The default value is None. Normally there is no need for user to set this property. 
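Note that the macOS large-pickle workaround above (chunked writes of the `pickle.dumps` output, needed for dicts over 4 GB on CPython 3) is now gated on `_is_file_path`, so in-memory targets always take the plain `pickle.dump` branch. Assuming the matching load-side change lands in the same series, a state dict can round-trip without touching disk; a small usage sketch:

```python
from io import BytesIO

import paddle

layer = paddle.nn.Linear(4, 4)

buffer = BytesIO()
paddle.save(layer.state_dict(), buffer)  # in-memory target instead of a path
buffer.seek(0)
layer.set_state_dict(paddle.load(buffer))
```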
For more information, please refer to :ref:`api_guide_Name` + iou_aware (bool): ${iou_aware_comment} + iou_aware_factor (float): ${iou_aware_factor_comment} Returns: Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, @@ -1204,6 +1208,8 @@ def yolo_box(x, "downsample_ratio": downsample_ratio, "clip_bbox": clip_bbox, "scale_x_y": scale_x_y, + "iou_aware": iou_aware, + "iou_aware_factor": iou_aware_factor } helper.append_op( diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index a2dee91dbef7c0..2a57c1a907aacc 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -39,9 +39,7 @@ "__rsub__": "A -= B", "__mul__": "A * B", "__rmul__": "A *= B", - "__div__": "A / B", "__truediv__": "A / B", - "__rdiv__": "A /= B", "__rtruediv__": "A /= B", "__pow__": "A ** B", "__rpow__": "A **= B", @@ -209,9 +207,6 @@ def _scalar_rsub_(var, value): def _scalar_mul_(var, value): return _scalar_op_(var, value, 0.0) - def _scalar_div_(var, value): - return _scalar_op_(var, 1.0 / value, 0.0) - def _binary_creator_(method_name, op_type, reverse=False, @@ -241,7 +236,10 @@ def __impl__(self, other_var): if op_type == 'elementwise_div' and self.dtype in _supported_int_dtype_: self = astype(self, 'float32') # here use `scale` replace `elementwise` to get better performance - # but only +, -, *, / can use this method + # but only +, -, * can use this method + # NOTE(chentianyu03): / can not use `scale` method,because the result of + # `scale` method (self*(1/other_var)) do not exactly equal with the result + # of `elementwise_div` method. if scalar_method is not None: return scalar_method(self, other_var) else: @@ -337,12 +335,8 @@ def __impl__(self, other_var): # a*b == b*a. Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), - ('__div__', _binary_creator_('__div__', 'elementwise_div', False, - _scalar_div_)), ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', - False, _scalar_div_)), - ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, - None)), + False, None)), ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ee08cb8654ec13..e02edb72ce1f71 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ import paddle from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only +from ..framework import Variable, OpProtoHolder, in_dygraph_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags from .. import dygraph_utils from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ @@ -9500,7 +9500,7 @@ def relu6(x, threshold=6.0, name=None): outputs={'Out': out}, attrs={ 'threshold': threshold, - 'use_mkldnn': core.globals()["FLAGS_use_mkldnn"] + 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"] }) return out @@ -11093,7 +11093,7 @@ def strided_slice(input, axes, starts, ends, strides): Then: result = [ [2], ] Args: - input (Variable): An N-D ``Tensor`` or ``LoDTensor`` . 
The data type is ``float32``, ``float64``, ``int32`` or ``int64``. + input (Variable): An N-D ``Tensor`` or ``LoDTensor`` . The data type is ``bool``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to. It's optional. If it is not provides, it will be treated as :math:`[0,1,...,len(starts)-1]`. starts (list|tuple|Variable): The data type is ``int32`` . If ``starts`` is a list or tuple, the elements of @@ -11144,7 +11144,7 @@ def strided_slice(input, axes, starts, ends, strides): helper = LayerHelper('strided_slice', **locals()) check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64'], + ['bool', 'float32', 'float64', 'int32', 'int64'], 'strided_slice') check_type(axes, 'axes', (list, tuple), 'strided_slice') check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') @@ -11569,7 +11569,7 @@ def gen_data(): axis=axis, act=act, op_name='elementwise_add', - use_mkldnn=core.globals()["FLAGS_use_mkldnn"]) + use_mkldnn=_global_flags()["FLAGS_use_mkldnn"]) return _elementwise_op(LayerHelper('elementwise_add', **locals())) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 813f671e020706..eee4bbbb1d54fe 100755 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -53,6 +53,7 @@ 'round', 'reciprocal', 'square', + 'lgamma', ] __inplace_unary_func__ = [ @@ -396,6 +397,19 @@ """) +add_sample_code(globals()["lgamma"], r""" +Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.lgamma(x) + print(out) + # [1.31452441, 1.76149750, 2.25271273, 1.09579802] + +""") + add_sample_code(globals()["softplus"], r""" Examples: .. 
code-block:: python diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index a62217c628c302..65cc745dbab883 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -36,11 +36,32 @@ from .utils import check_shape __all__ = [ - 'create_tensor', 'create_parameter', 'create_global_var', 'cast', - 'tensor_array_to_tensor', 'concat', 'sums', 'assign', - 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', - 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite', - 'range', 'linspace', 'zeros_like', 'ones_like', 'diag', 'eye', 'triu' + 'create_tensor', + 'create_parameter', + 'create_global_var', + 'cast', + 'tensor_array_to_tensor', + 'concat', + 'sums', + 'assign', + 'fill_constant_batch_size_like', + 'fill_constant', + 'argmin', + 'argmax', + 'argsort', + 'ones', + 'zeros', + 'reverse', + 'has_inf', + 'has_nan', + 'isfinite', + 'range', + 'linspace', + 'zeros_like', + 'ones_like', + 'diag', + 'eye', + 'triu', ] diff --git a/python/paddle/fluid/net_drawer.py b/python/paddle/fluid/net_drawer.py index f991310384f769..fd8f6eaf364c41 100644 --- a/python/paddle/fluid/net_drawer.py +++ b/python/paddle/fluid/net_drawer.py @@ -102,11 +102,11 @@ def parse_graph(program, graph, var_dict, **kwargs): def draw_graph(startup_program, main_program, **kwargs): if "graph_attr" in kwargs: - GRAPH_STYLE.update(kwargs[graph_attr]) + GRAPH_STYLE.update(kwargs["graph_attr"]) if "node_attr" in kwargs: - OP_STYLE.update(kwargs[node_attr]) + OP_STYLE.update(kwargs["node_attr"]) if "edge_attr" in kwargs: - VAR_STYLE.update(kwargs[edge_attr]) + VAR_STYLE.update(kwargs["edge_attr"]) graph_id = unique_id() filename = kwargs.get("filename") diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c0b93c83f78e12..b1b6c95ea33604 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -14,6 +14,7 @@ from __future__ import print_function +import warnings import numpy as np import six import os @@ -21,6 +22,7 @@ from collections import defaultdict import paddle +import paddle.fluid as fluid from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard @@ -33,7 +35,6 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops -from .regularizer import append_regularization_ops from .dygraph import base as imperative_base from .dygraph import no_grad from .dygraph.learning_rate_scheduler import LearningRateDecay, _LearningRateEpochDecay @@ -257,11 +258,11 @@ def _load_state_para(state_dict, param): assert model_np.shape == load_para_np.shape, \ "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - item.name, model_np.shape, load_para_np.shape) + param.name, model_np.shape, load_para_np.shape) assert model_np.dtype == load_para_np.dtype, \ "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - item.name, model_np.dtype, load_para_np.dtype) + param.name, model_np.dtype, load_para_np.dtype) tensor.set(load_para_np, framework._current_expected_place()) @@ -884,6 +885,93 @@ def backward(self, act_no_grad_set, callbacks) return params_grads + def _create_regularization_of_grad(self, param, grad, regularization=None): + """ Create and add backward 
regularization Operators + + Function helper of append_regularization_ops. + """ + # If no gradient or no regularization is specified, then we don't need to do anything + if grad is None or ((not hasattr(param, 'regularizer') or + (hasattr(param, 'regularizer') and + param.regularizer is None)) and + regularization is None): + return grad + regularization_term = None + if hasattr(param, 'regularizer') and param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad, grad.block) + + assert regularization_term is not None + + new_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, + # the grad's type and name will be changed. But the gradient's name + # is used in ParallelExecutor Reduce mode, so I add a flag for + # the new_grad here. + new_grad = grad.block.create_var( + name=grad.name + core.kNewGradSuffix(), + dtype=param.dtype, + shape=param.shape, + lod_level=param.lod_level, + type=core.VarDesc.VarType.LOD_TENSOR) + + inputs = {"X": [grad, regularization_term]} + outputs = {"Out": [new_grad]} + if framework.in_dygraph_mode(): + new_grad = core.ops.sum([grad, regularization_term]) + else: + grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) + + return new_grad + + def append_regularization_ops(self, + parameters_and_grads, + regularization=None): + r"""Create and add backward regularization Operators + + Creates and adds backward regularization operators in the BlockDesc. + This will add gradients of the regularizer function to the gradients + of the parameters and return these modified gradients. This is the + same as implementing weight decay in optimizers for regularization. + + Args: + parameters_and_grads: A list of (parameters, gradients) pairs + that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. + + Returns: + list[(Variable, Variable)]: list of (parameters, gradients) \ + pair with the regularized gradient + + Raises: + Exception: Unknown regularization type + """ + params_and_grads = [] + if framework.in_dygraph_mode(): + for param, grad in parameters_and_grads: + new_grad = self._create_regularization_of_grad(param, grad, + regularization) + params_and_grads.append((param, new_grad)) + else: + repeate_regularizer = False + with framework.name_scope('regularization'): + for param, grad in parameters_and_grads: + if not repeate_regularizer and param.regularizer is not None and regularization is not None: + repeate_regularizer = True + logging.info( + "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " + "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
+ % regularization.__str__()) + with param.block.program._optimized_guard([param, grad]): + new_grad = self._create_regularization_of_grad( + param, grad, regularization) + params_and_grads.append((param, new_grad)) + return params_and_grads + def apply_gradients(self, params_grads): """ Second part of `minimize`, appending optimization operators for @@ -916,8 +1004,8 @@ def apply_gradients(self, params_grads): params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops(params_grads, + self.regularization) optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops @@ -939,8 +1027,8 @@ def apply_optimize(self, loss, startup_program, params_grads): framework.default_startup_program()): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) - params_grads = append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops( + params_grads, self.regularization) optimize_ops = self._create_optimization_pass(params_grads) else: program = loss.block.program @@ -1383,7 +1471,7 @@ def __init__(self, assert isinstance( num_trainers, int ), "The type of num_trainers should be 'int', but received %s" % type( - value) + num_trainers) assert num_trainers > 0, "The value of num_trainers should be greater than 0!" self._num_trainers = num_trainers @@ -1674,8 +1762,8 @@ def apply_gradients(self, params_grads): not_dgc_params_grads = append_gradient_clip_ops( not_dgc_params_grads) - not_dgc_params_grads = append_regularization_ops(not_dgc_params_grads, - self.regularization) + not_dgc_params_grads = self.append_regularization_ops( + not_dgc_params_grads, self.regularization) params_grads = not_dgc_params_grads + dgc_params_grads params_grads = sorted(params_grads, key=lambda x: x[0].name) @@ -1725,6 +1813,9 @@ class LarsMomentumOptimizer(Optimizer): For details, please refer to :ref:`api_guide_Name`. Default is None. exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None. epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. + rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \ + before updating. Often choose to be `1.0/batch_size`. Examples: .. 
code-block:: python @@ -1758,7 +1849,9 @@ def __init__(self, grad_clip=None, name=None, exclude_from_weight_decay=None, - epsilon=0): + epsilon=0, + multi_precision=False, + rescale_grad=1.0): assert learning_rate is not None assert momentum is not None super(LarsMomentumOptimizer, self).__init__( @@ -1776,16 +1869,70 @@ def __init__(self, self._exclude_from_weight_decay = [] else: self._exclude_from_weight_decay = exclude_from_weight_decay + self._multi_precision = multi_precision + self._rescale_grad = float(rescale_grad) + self._master_weights = {} + + def _create_master_weight(self, param): + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + target_param = self._master_weights[ + param.name] if find_master else param + target_name = target_param.name + if (name not in self._accumulators or + target_name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, target_name)) + return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._velocity_acc_str, master_p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." 
+ ) self._add_accumulator(self._velocity_acc_str, p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) - _lars_weight_decay = self._lars_weight_decay param_name = param_and_grad[0].name if len(self._exclude_from_weight_decay) > 0: @@ -1796,25 +1943,40 @@ def _append_optimize_op(self, block, param_and_grad): velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) + + attrs = { + "mu": self._momentum, + "lars_coeff": self._lars_coeff, + "lars_weight_decay": _lars_weight_decay, + "multi_precision": find_master, + "rescale_grad": self._rescale_grad + } + + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": lr + } + + outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + # create the momentum optimize op momentum_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Velocity": velocity_acc, - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={ - "ParamOut": param_and_grad[0], - "VelocityOut": velocity_acc - }, - attrs={ - "mu": self._momentum, - "lars_coeff": self._lars_coeff, - "lars_weight_decay": _lars_weight_decay, - "epsilon": self._epsilon - }, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True) return momentum_op @@ -4010,6 +4172,7 @@ def _insert_allreduce_op(self, op_idx, block): 'out_dtype': out_var.dtype, self._op_role_key: self._op_role.Optimize }) + offset += 1 return offset def _create_vars(self, block, ori_block): @@ -4522,12 +4685,15 @@ def _insert_send_recv(cur_id, prev_id): 'ring_id': ring_id }) extra_index_info['index'] += 1 + var_shape = list(var.shape) + var_shape[0] = self.micro_batch_size if var_shape[ + 0] < 0 else var_shape[0] block._insert_op_without_sync( index=index + extra_index_info['index'], type='recv_v2', outputs={'Out': [var]}, attrs={ - 'out_shape': var.shape, + 'out_shape': var_shape, 'dtype': var.dtype, self._op_device_key: cur_dev, self._op_role_key: op_role, diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index 9f2b2127aa7043..616daf5a650413 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1291,7 +1291,7 @@ def __thread_main__(legacy_expected_place): except Exception as ex: self._queue.kill() self._thread = None - logging.warn('Your reader has raised an exception!') + logging.warning('Your reader has raised an exception!') six.reraise(*sys.exc_info()) self._thread = threading.Thread( diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 64ce283a63c5bf..64bbca6c57c540 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -22,92 +22,6 @@ __all__ = ['L1Decay', 'L2Decay', 'L1DecayRegularizer', 'L2DecayRegularizer'] -def _create_regularization_of_grad(param, grad, regularization=None): - """ Create and add backward regularization Operators - - Function helper of append_regularization_ops. 
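The widened `LarsMomentumOptimizer` surface follows the existing Momentum FP16 support: with `multi_precision=True` a float32 master copy of each FP16 parameter is created at startup time and carries the velocity update, while `rescale_grad` multiplies the gradient before the update. A construction sketch against the new signature (hyperparameter values are illustrative):

```python
import paddle
import paddle.fluid as fluid

paddle.enable_static()

optimizer = fluid.optimizer.LarsMomentumOptimizer(
    learning_rate=0.001,
    momentum=0.9,
    lars_weight_decay=0.0005,
    multi_precision=True,    # keep FP32 master weights for FP16 params
    rescale_grad=1.0 / 128)  # often chosen as 1.0 / batch_size
```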
- """ - # If no gradient or no regularization is specified, then we don't need to do anything - if grad is None or ((not hasattr(param, 'regularizer') or ( - hasattr(param, 'regularizer') and param.regularizer is None)) and - regularization is None): - return grad - regularization_term = None - if hasattr(param, 'regularizer') and param.regularizer is not None: - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad, grad.block) - elif regularization is not None: - regularization_term = regularization(param, grad, grad.block) - - assert regularization_term is not None - - new_grad = grad - if grad.type == core.VarDesc.VarType.SELECTED_ROWS: - # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, - # the grad's type and name will be changed. But the gradient's name - # is used in ParallelExecutor Reduce mode, so I add a flag for - # the new_grad here. - new_grad = grad.block.create_var( - name=grad.name + core.kNewGradSuffix(), - dtype=param.dtype, - shape=param.shape, - lod_level=param.lod_level, - type=core.VarDesc.VarType.LOD_TENSOR) - - inputs = {"X": [grad, regularization_term]} - outputs = {"Out": [new_grad]} - if in_dygraph_mode(): - new_grad = core.ops.sum([grad, regularization_term]) - else: - grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) - - return new_grad - - -def append_regularization_ops(parameters_and_grads, regularization=None): - r"""Create and add backward regularization Operators - - Creates and adds backward regularization operators in the BlockDesc. - This will add gradients of the regularizer function to the gradients - of the parameters and return these modified gradients. This is the - same as implementing weight decay in optimizers for regularization. - - Args: - parameters_and_grads: A list of (parameters, gradients) pairs - that need to be regularized. - regularization: A global regularizer. If the parameter is not - set. It will be applied with regularizer. - - Returns: - list[(Variable, Variable)]: list of (parameters, gradients) \ - pair with the regularized gradient - - Raises: - Exception: Unknown regularization type - """ - params_and_grads = [] - if in_dygraph_mode(): - for param, grad in parameters_and_grads: - new_grad = _create_regularization_of_grad(param, grad, - regularization) - params_and_grads.append((param, new_grad)) - else: - repeate_regularizer = False - with framework.name_scope('regularization'): - for param, grad in parameters_and_grads: - if not repeate_regularizer and param.regularizer is not None and regularization is not None: - repeate_regularizer = True - logging.info( - "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " - "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
- % regularization.__str__()) - with param.block.program._optimized_guard([param, grad]): - new_grad = _create_regularization_of_grad(param, grad, - regularization) - params_and_grads.append((param, new_grad)) - return params_and_grads - - class WeightDecayRegularizer(object): """Base class for weight decay regularizers diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py index 69f3ff46b3ac9c..301bd0ff0039e0 100644 --- a/python/paddle/fluid/tests/test_beam_search_decoder.py +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -246,7 +246,7 @@ def f(*args): def inject_test_decode(use_cuda, decorator=None): - f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu', 'sparse') + f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu') def f(*args): with scope_prog_guard(): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 18f99665e2b4da..03aaf7ed03e26d 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -80,6 +80,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_c_split) LIST(REMOVE_ITEM TEST_OPS test_allgather) LIST(REMOVE_ITEM TEST_OPS test_c_identity) + LIST(REMOVE_ITEM TEST_OPS test_c_embedding_op) LIST(REMOVE_ITEM TEST_OPS test_allreduce) LIST(REMOVE_ITEM TEST_OPS test_broadcast) LIST(REMOVE_ITEM TEST_OPS test_collective_reduce) @@ -103,6 +104,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_collective_sendrecv_api) LIST(REMOVE_ITEM TEST_OPS test_collective_wait) LIST(REMOVE_ITEM TEST_OPS test_memcpy_op) + LIST(REMOVE_ITEM TEST_OPS test_raw_program_optimizer) endif() if(WIN32) @@ -576,7 +578,7 @@ endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) # Coverage pipeline use cuda 10.1 now, profiler will random hang in cuda 10.1, # see https://github.com/PaddlePaddle/Paddle/issues/29082 for details. -# We guess there are some bugs in cuda 10.1 or 10.2, +# We guess there are some bugs in cuda 10.1 or 10.2, # since this unittest is stable in cuda 11 (py3 pipeline) now. if(NOT WITH_COVERAGE) py_test_modules(test_parallel_executor_profiler MODULES test_parallel_executor_profiler) @@ -601,8 +603,8 @@ py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_c py_test_modules(test_fuse_bn_add_act_pass MODULES test_fuse_bn_add_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000) # NOTE: These unittests will appear NaN steadily in windows CI. After analysis, -# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, -# which will not appear in other CIs. The calculation behavior of some ops in inference mode is +# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, +# which will not appear in other CIs. The calculation behavior of some ops in inference mode is # inconsistent with that in non-inference mode. 
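Moving `append_regularization_ops` onto `Optimizer` (the free functions are deleted from `regularizer.py` above) lets subclasses such as the DGC optimizer reuse it through `self`, without changing the precedence rule: a regularizer attached to a parameter via `ParamAttr` always wins, and the optimizer-level regularizer then applies only to the remaining parameters. A short static-graph sketch of that precedence:

```python
import paddle
import paddle.fluid as fluid

paddle.enable_static()

x = fluid.data(name='x', shape=[None, 8], dtype='float32')
# Per-parameter decay: this weight ignores the optimizer-level regularizer.
w_attr = fluid.ParamAttr(regularizer=fluid.regularizer.L2Decay(1e-4))
y = fluid.layers.fc(input=x, size=4, param_attr=w_attr)
loss = fluid.layers.mean(y)

sgd = fluid.optimizer.SGD(
    learning_rate=0.01, regularization=fluid.regularizer.L2Decay(1e-3))
sgd.minimize(loss)  # logs that the global regularizer is skipped for fc_0.w_0
```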
if(NOT ON_INFER) py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu) @@ -645,7 +647,7 @@ if (WITH_XPU) add_subdirectory(xpu) endif() -# dist xpu tests: +# dist xpu tests: if (WITH_XPU_BKCL) py_test(test_collective_reduce_api_xpu SRCS "test_collective_reduce_api.py") py_test(test_collective_allreduce_api_xpu SRCS "test_collective_allreduce_api.py") @@ -659,6 +661,8 @@ if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() +add_subdirectory(asp) + add_subdirectory(ir) if (WITH_TESTING) @@ -713,6 +717,7 @@ if (WITH_DISTRIBUTE) set_tests_properties(test_dist_fleet_ctr2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_fleet_infer PROPERTIES TIMEOUT 200) + set_tests_properties(test_dist_fleet_raw_program_optimizer PROPERTIES TIMEOUT 120) endif() if (WITH_DISTRIBUTE AND NOT APPLE) @@ -849,7 +854,7 @@ set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) -set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 270) set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt new file mode 100644 index 00000000000000..f71e04c09aa38b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/asp/__init__.py b/python/paddle/fluid/tests/unittests/asp/__init__.py new file mode 100644 index 00000000000000..4c551792f989c0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py new file mode 100644 index 00000000000000..370d73cc35a43a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/asp_pruning_base.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPHelperPruningBase(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, self.predict = build_model() + + def run_inference_pruning_test(self, get_mask_gen_func, + get_mask_check_func): + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, get_mask_gen_func, + get_mask_check_func, False) + + def run_training_pruning_test(self, get_mask_gen_func, get_mask_check_func): + with fluid.program_guard(self.main_program, self.startup_program): + loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=self.predict, label=self.label)) + optimizer = sparsity.decorate( + fluid.optimizer.SGD(learning_rate=0.01)) + optimizer.minimize(loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + + self.__pruning_and_checking(exe, place, get_mask_gen_func, + get_mask_check_func, True) + + def __pruning_and_checking(self, exe, place, mask_func_name, + check_func_name, with_mask): + exe.run(self.startup_program) + sparsity.prune_model( + place, + self.main_program, + func_name=mask_func_name, + with_mask=with_mask) + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue( + sparsity.check_sparsity( + mat.T, func_name=check_func_name, n=2, m=4)) diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py new file mode 100644 index 00000000000000..402861ad5d9312 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_optimize.py @@ -0,0 +1,202 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
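Every pruning test above ultimately asserts `sparsity.check_sparsity(mat.T, n=2, m=4)`: in each group of `m=4` consecutive weights at most `n=2` may be non-zero, the 2:4 pattern that NVIDIA sparse tensor cores accelerate. A plain NumPy sketch of the invariant being checked:

```python
import numpy as np


def check_n_of_m(mat, n=2, m=4):
    """True if every length-m group along each row has at most n non-zeros."""
    rows, cols = mat.shape
    assert cols % m == 0, "row length must be a multiple of m"
    groups = mat.reshape(rows, cols // m, m)
    return bool((np.count_nonzero(groups, axis=-1) <= n).all())


dense = np.ones((2, 8))
pruned = dense.copy()
pruned[:, [0, 1, 4, 5]] = 0.0  # keep 2 of every 4 entries
print(check_n_of_m(dense))     # False
print(check_n_of_m(pruned))    # True
```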
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import threading, time +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.contrib import sparsity +from paddle.fluid.contrib.sparsity.asp import ASPHelper +import numpy as np + +paddle.enable_static() + + +class TestASPHelper(unittest.TestCase): + def setUp(self): + self.main_program = fluid.Program() + self.startup_program = fluid.Program() + + def build_model(): + img = fluid.data( + name='img', shape=[None, 3, 32, 32], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + hidden = fluid.layers.conv2d( + input=img, num_filters=4, filter_size=3, padding=2, act="relu") + hidden = fluid.layers.fc(input=hidden, size=32, act='relu') + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + return img, label, prediction + + with fluid.program_guard(self.main_program, self.startup_program): + self.img, self.label, predict = build_model() + self.loss = fluid.layers.mean( + fluid.layers.cross_entropy( + input=predict, label=self.label)) + self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + + def test_get_not_ASP_relevant_vars(self): + def check_params(params, params_from_asp): + if len(params_from_asp) != len(params): + return False + + for i, p in enumerate(params_from_asp): + if p.name != params[i].name: + return False + return True + + params = self.main_program.global_block().all_parameters() + params_from_asp = ASPHelper._get_not_ASP_relevant_vars( + self.main_program) + self.assertTrue(check_params(params, params_from_asp)) + + with fluid.program_guard(self.main_program, self.startup_program): + ASPHelper._minimize(self.optimizer, self.loss, self.main_program, + self.startup_program) + params_from_asp_after_opt = ASPHelper._get_not_ASP_relevant_vars( + self.main_program) + self.assertTrue(check_params(params, params_from_asp_after_opt)) + + def test_is_supported_layers(self): + program = paddle.static.default_main_program() + + names = [ + 'embedding_0.w_0', 'fack_layer_0.w_0', 'conv2d_0.w_0', + 'conv2d_0.b_0', 'conv2d_1.w_0', 'conv2d_1.b_0', 'fc_0.w_0', + 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0', 'linear_2.w_0', 'linear_2.b_0' + ] + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + sparsity.set_excluded_layers(program, ['fc_1', 'conv2d_0']) + ref = [ + False, False, False, False, True, False, True, False, False, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + sparsity.reset_excluded_layers(program) + ref = [ + False, False, True, False, True, False, True, False, True, False, + True, False + ] + for i, name in enumerate(names): + self.assertTrue( + ref[i] == ASPHelper._is_supported_layer(program, name)) + + def test_decorate(self): + param_names = self.__get_param_names(self.main_program.global_block() + .all_parameters()) + with fluid.program_guard(self.main_program, 
self.startup_program): + self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + param_names_after_minimize = self.__get_param_names( + self.main_program.global_block().all_parameters()) + + self.__check_mask_variables_and_ops(param_names, + param_names_after_minimize) + + def test_asp_training(self): + with fluid.program_guard(self.main_program, self.startup_program): + self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + place = paddle.CPUPlace() + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[self.img, self.label], place=place) + + exe.run(self.startup_program) + sparsity.prune_model(place, self.main_program) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + exe.run(self.main_program, feed=feeder.feed([data])) + + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4)) + + def test_asp_training_with_amp(self): + if core.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + with fluid.program_guard(self.main_program, self.startup_program): + self.optimizer = fluid.contrib.mixed_precision.decorator.decorate( + self.optimizer) + self.optimizer = sparsity.decorate(self.optimizer) + self.optimizer.minimize(self.loss, self.startup_program) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder( + feed_list=[self.img, self.label], place=place) + + exe.run(self.startup_program) + sparsity.prune_model(place, self.main_program) + + data = (np.random.randn(64, 3, 32, 32), np.random.randint( + 10, size=(64, 1))) + exe.run(self.main_program, feed=feeder.feed([data])) + + for param in self.main_program.global_block().all_parameters(): + if ASPHelper._is_supported_layer(self.main_program, param.name): + mat = np.array(fluid.global_scope().find_var(param.name) + .get_tensor()) + self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4)) + + def __get_param_names(self, params): + param_names = [] + for p in params: + param_names.append(p.name) + return param_names + + def __check_mask_variables_and_ops(self, param_names, + param_names_after_minimize): + for n in param_names: + self.assertFalse(ASPHelper._is_supported_layer(self.main_program, n) and \ + ASPHelper._get_mask_name(n) not in param_names_after_minimize) + + mask_names = [] + for n in param_names: + if ASPHelper._is_supported_layer(self.main_program, n): + mask_names.append(ASPHelper._get_mask_name(n)) + + masking_ops = [] + for op in self.main_program.global_block().ops: + if op.type == 'elementwise_mul' and \ + op.input('Y')[0] in mask_names: + masking_ops.append(op.input('Y')[0]) + + self.assertTrue(len(masking_ops) == len(mask_names)) + for n in masking_ops: + self.assertTrue(n in mask_names) + + for n in mask_names: + self.assertTrue(n in masking_ops) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py new file mode 100644 index 00000000000000..ee4b2c002f5afa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_1d.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
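Stripped of the assertions, the static-graph workflow these ASP tests exercise is: build the network, wrap the optimizer with `sparsity.decorate` before `minimize` (which inserts the mask variables and masking ops), run the startup program, then call `sparsity.prune_model` once to impose the 2:4 masks. A condensed sketch of that flow:

```python
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib import sparsity

paddle.enable_static()

main, startup = fluid.Program(), fluid.Program()
with fluid.program_guard(main, startup):
    img = fluid.data(name='img', shape=[None, 32], dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
    out = fluid.layers.fc(input=img, size=10, act='softmax')
    loss = fluid.layers.mean(
        fluid.layers.cross_entropy(input=out, label=label))
    optimizer = sparsity.decorate(fluid.optimizer.SGD(learning_rate=0.01))
    optimizer.minimize(loss, startup)

place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda() \
    else fluid.CPUPlace()
fluid.Executor(place).run(startup)
sparsity.prune_model(place, main)  # apply 2:4 masks to supported weights
```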
+# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +from paddle.fluid.contrib import sparsity +from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase + +paddle.enable_static() + + +class TestASPHelperPruning1D(TestASPHelperPruningBase): + def test_1D_inference_pruning(self): + self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_1D, + sparsity.CheckMethod.CHECK_1D) + + def test_1D_training_pruning(self): + self.run_training_pruning_test(sparsity.MaskAlgo.MASK_1D, + sparsity.CheckMethod.CHECK_1D) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py new file mode 100644 index 00000000000000..1b8b1e4a06ae4c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_best.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +from paddle.fluid.contrib import sparsity +from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase + +paddle.enable_static() + + +class TestASPHelperPruning2DBest(TestASPHelperPruningBase): + def test_2D_best_inference_pruning(self): + self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_2D_BEST, + sparsity.CheckMethod.CHECK_2D) + + def test_2D_best_training_pruning(self): + self.run_training_pruning_test(sparsity.MaskAlgo.MASK_2D_BEST, + sparsity.CheckMethod.CHECK_2D) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py new file mode 100644 index 00000000000000..4bdd310f0209a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_pruning_2d_greedy.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +from paddle.fluid.contrib import sparsity +from paddle.fluid.tests.unittests.asp.asp_pruning_base import TestASPHelperPruningBase + +paddle.enable_static() + + +class TestASPHelperPruning2DGreedy(TestASPHelperPruningBase): + def test_2D_greedy_inference_pruning(self): + self.run_inference_pruning_test(sparsity.MaskAlgo.MASK_2D_GREEDY, + sparsity.CheckMethod.CHECK_2D) + + def test_2D_greedy_training_pruning(self): + self.run_training_pruning_test(sparsity.MaskAlgo.MASK_2D_GREEDY, + sparsity.CheckMethod.CHECK_2D) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_asp_utils.py b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py similarity index 94% rename from python/paddle/fluid/tests/unittests/test_asp_utils.py rename to python/paddle/fluid/tests/unittests/asp/test_asp_utils.py index faffd477ae5661..387cb55e5c3cfd 100644 --- a/python/paddle/fluid/tests/unittests/test_asp_utils.py +++ b/python/paddle/fluid/tests/unittests/asp/test_asp_utils.py @@ -39,9 +39,9 @@ def test_density(self): x = np.array([[1.0, 1.0, 1.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [1.0, 0.0, 0.0, 0.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0, 1.0]]) - self.assertEqual(sparsity.density(x), 0.56) + self.assertEqual(sparsity.calculate_density(x), 0.56) x[:, 0] = 0.0 - self.assertEqual(sparsity.density(x), 0.4) + self.assertEqual(sparsity.calculate_density(x), 0.4) def test_check_mask_1d(self): x = np.array([[1.0, 0.0, 0.0, 1.0, 1.0], [1.0, 1.0, 0.0, 0.0, 1.0], @@ -114,11 +114,11 @@ def get_reference(m=4, n=2): for _ in range(4): computing_thread = threading.Thread( target=paddle.fluid.contrib.sparsity.utils. - compute_valid_2d_patterns, + _compute_valid_2d_patterns, args=(2, 4)) computing_thread.start() time.sleep(3) - patterns_map = paddle.fluid.contrib.sparsity.utils.valid_2d_patterns + patterns_map = paddle.fluid.contrib.sparsity.utils._valid_2d_patterns reference_patterns = get_reference() reference_key = '4_2' diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py new file mode 100644 index 00000000000000..08bab306df1b11 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base_dygraph.py @@ -0,0 +1,112 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
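The dygraph nan/inf harness below depends on `FLAGS_check_nan_inf` being exported before `import paddle`, so the C++ checker is armed when the framework initializes; once any op output contains a NaN or Inf the run is expected to abort (the test accepts `RuntimeError`, or `OSError` for CUDA enforce errors). A minimal repro of the pattern, CPU path:

```python
import os
# Must be set before paddle is imported.
os.environ["FLAGS_check_nan_inf"] = "1"

import paddle

try:
    x = paddle.to_tensor([1.0, float("nan")])
    y = x * 2  # the checker inspects op outputs and raises on nan/inf
except RuntimeError as e:
    print("caught:", type(e).__name__)
```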
+ +from __future__ import unicode_literals +from __future__ import print_function + +import os +import sys +import time +import numpy as np + +os.environ[str("FLAGS_check_nan_inf")] = str("1") +os.environ[str("GLOG_vmodule")] = str("nan_inf_utils_detail=10") + +import paddle +import paddle.nn as nn + +np.random.seed(0) + + +def generator(): + batch_size = 5 + for i in range(5): + curr_train_x = np.random.randint( + batch_size, size=(batch_size, 3)).astype("float32") + if i >= 2: + curr_train_x[0, :] = np.nan + curr_train_x[-1, :] = np.inf + res = [] + for i in range(batch_size): + y = i % 3 + res.append([y]) + y_label = np.array(res).astype('int64') + yield [curr_train_x, y_label] + + +class TestLayer(nn.Layer): + def __init__(self): + super(TestLayer, self).__init__() + self.linear1 = nn.Linear(3, 400) + self.linear2 = nn.Linear(400, 400) + self.linear3 = nn.Linear(400, 3) + + def forward(self, x): + x = self.linear1(x) + x = nn.functional.sigmoid(x) + x = self.linear2(x) + x = nn.functional.sigmoid(x) + x = self.linear3(x) + x = nn.functional.softmax(x) + + return x + + +def check(use_cuda): + paddle.set_device('gpu' if use_cuda else 'cpu') + + net = TestLayer() + sgd = paddle.optimizer.SGD(learning_rate=0.05, parameters=net.parameters()) + + for step, (x, y) in enumerate(generator()): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + + zero = paddle.zeros(shape=[1], dtype='int64') + fp16_zero = paddle.cast(zero, dtype='float16') + + y = y + zero + + y_pred = net(x) + + cost = nn.functional.cross_entropy(y_pred, y, use_softmax=False) + avg_cost = paddle.mean(cost) + + acc_top1 = paddle.metric.accuracy(input=y_pred, label=y, k=1) + + print('iter={:.0f}, cost={}, acc1={}'.format( + step, avg_cost.numpy(), acc_top1.numpy())) + + sgd.step() + sgd.clear_grad() + + +if __name__ == '__main__': + if paddle.is_compiled_with_cuda(): + try: + check(use_cuda=True) + assert False + except Exception as e: + print(e) + print(type(e)) + # Note. Enforce in cuda kernel may not catch in paddle, and + # Exception type will be RuntimeError + assert type(e) == OSError or type(e) == RuntimeError + try: + check(use_cuda=False) + assert False + except Exception as e: + print(e) + print(type(e)) + assert type(e) == RuntimeError diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py new file mode 100644 index 00000000000000..575c07390a35bb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
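The companion training script added next exercises Fleet's raw-program path: setting `strategy.without_graph_optimization = True` skips the graph-rewriting passes so the distributed optimizer works on the user program directly. The decoration boilerplate it uses, reduced to a sketch (assumes a `fleet`-launched multi-process environment; `avg_cost` stands in for the model's loss):

```python
import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
strategy.without_graph_optimization = True  # raw-program optimizer path

optimizer = paddle.fluid.optimizer.Adam(0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# optimizer.minimize(avg_cost) then proceeds as in a single-device program
```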
+ +from test_dist_base import TestDistRunnerBase, runtime_main +import unittest +import paddle +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import numpy as np +from functools import reduce +import paddle.fluid as fluid + +paddle.enable_static() + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + return predict + + +class TestFleetMetaOptimizerPrecision(TestDistRunnerBase): + def get_model(self, batch_size=2, single_device=False): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + test_program = fluid.default_main_program().clone(for_test=True) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + + optimizer = paddle.fluid.optimizer.Adam(0.01) + if single_device: + optimizer.minimize(avg_cost) + else: + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.without_graph_optimization = True + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + return test_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestFleetMetaOptimizerPrecision) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index 4f35befda8e2cd..affec2f7dfefc3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -18,6 +18,7 @@ from functools import reduce from paddle.fluid.dygraph import declarative, to_variable from paddle.fluid.dygraph import Embedding, Layer, Linear +from paddle.static import Variable class EmbeddingLayer(object): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py 
index e0b7e9033dd5e6..5cbaeb0f4046e3 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py
@@ -14,6 +14,7 @@
 from functools import reduce
 
 import paddle
+from paddle.static import Variable
 
 
 class EmbeddingLayer(object):
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py
new file mode 100644
index 00000000000000..ab87beb9e10172
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import paddle
+import unittest
+
+
+class GradLayer(paddle.nn.Layer):
+    def __init__(self):
+        super(GradLayer, self).__init__()
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        x.stop_gradient = False
+        y = x * x
+        dx = paddle.grad(outputs=[y], inputs=[x])[0]
+        return dx
+
+
+class GradLinearLayer(paddle.nn.Layer):
+    def __init__(self):
+        super(GradLinearLayer, self).__init__()
+        self.linear = paddle.nn.Linear(5, 5, bias_attr=False)
+
+    @paddle.jit.to_static
+    def forward(self, x):
+        x.stop_gradient = False
+        tmp = x + x
+        for i in range(10):
+            tmp = self.linear(tmp)
+        out = tmp
+        dx = paddle.grad(
+            [out], [x], None, create_graph=True, allow_unused=False)[0]
+        return dx
+
+
+class TestGrad(unittest.TestCase):
+    def setUp(self):
+        self.func = GradLayer()
+        self.x = paddle.ones(shape=[10, 2, 5], dtype='float32')
+        self.x.stop_gradient = False
+
+    def _run(self, func, to_static):
+        prog_trans = paddle.jit.ProgramTranslator()
+        prog_trans.enable(to_static)
+        ret = func(self.x).numpy()
+        prog_trans.enable(True)
+        return ret
+
+    def test_forward(self):
+        dygraph_res = self._run(self.func, to_static=False)
+        static_res = self._run(self.func, to_static=True)
+        self.assertTrue(np.allclose(static_res, dygraph_res))
+
+
+class TestGradLinear(TestGrad):
+    def setUp(self):
+        self.func = GradLinearLayer()
+        self.x = paddle.ones(shape=[10, 2, 5], dtype='float32')
+        self.x.stop_gradient = False
+
+    def test_save_infer_program(self):
+        path = "double_grad_infer_model"
+        input_spec = [
+            paddle.static.InputSpec(
+                shape=[10, 2, 5], dtype='float32')
+        ]
+        paddle.jit.save(self.func, path, input_spec=input_spec)
+        load_func = paddle.jit.load(path)
+
+        origin_res = self.func(self.x).numpy()
+        load_res = load_func(self.x).numpy()
+        self.assertTrue(np.allclose(origin_res, load_res))
+
+    def test_save_train_program(self):
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0)
+        optimizer = paddle.optimizer.SGD(learning_rate=0.01,
+                                         grad_clip=grad_clip,
+                                         parameters=self.func.parameters())
+        for i in range(10):
+            out = self.func(self.x)
+            avg_loss = paddle.mean(paddle.abs(out - 1))
+            avg_loss.backward()
+            optimizer.minimize(avg_loss)
+            self.func.clear_gradients()
+
+        path = "double_grad_train_model"
+        paddle.jit.save(self.func, path)
+        load_func = paddle.jit.load(path)
+
+        origin_res = self.func(self.x).numpy()
+        load_res = load_func(self.x).numpy()
+        self.assertTrue(np.allclose(origin_res, load_res))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py
new file mode 100644
index 00000000000000..a39b5d7cd1a44b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_op_attr.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import six
+import paddle
+import unittest
+import numpy as np
+
+from paddle.static import InputSpec
+
+
+class MySub(paddle.nn.Layer):
+    def __init__(self):
+        super(MySub, self).__init__()
+
+    def forward(self, x, y, name=None):
+        return paddle.subtract(x, y, name)
+
+
+class NetWithOpAttr(paddle.nn.Layer):
+    def __init__(self, in_num, out_num):
+        super(NetWithOpAttr, self).__init__()
+
+        self.linear = paddle.nn.Linear(in_num, out_num)
+        self.bn = paddle.nn.BatchNorm(out_num)
+        self.sub = MySub()
+
+    def forward(self, x):
+        out = self.linear(x)
+        out = self.sub(out, x)
+        out = self.bn(out)
+        return out
+
+    @paddle.jit.to_static(input_spec=[InputSpec([10, 16])])
+    def with_cond(self, x):
+        if paddle.mean(x) > 0.:
+            out = self.linear(x)
+        else:
+            out = self.sub(x, x)
+        out = self.bn(out)
+        return out
+
+
+class CheckOpAttr(unittest.TestCase):
+    def setUp(self):
+        self.in_num = 16
+        self.out_num = 16
+        self.x = paddle.randn([10, self.in_num])
+        self.expected_results()
+
+    def expected_results(self):
+        self.fc_attrs = {
+            "int_val": 10,
+            "int_vals": [10, 20],
+            "float_val": 3.8,
+            "float_vals": [3.8, -0.2]
+        }
+        self.bn_attrs = {"bool_val": True, "bool_vals": [True, False]}
+        self.sub_attrs = {"int_vals": [10, 20], "bool_vals": [True, False]}
+
+        self.infos = {
+            'matmul': self.fc_attrs,
+            'elementwise_add': self.fc_attrs,
+            'batch_norm': self.bn_attrs,
+            'tanh': self.bn_attrs,
+            'elementwise_sub': self.sub_attrs
+        }
+
+    def test_set_op_attrs(self):
+        net = NetWithOpAttr(self.in_num, self.out_num)
+        # set attrs
+        net.linear._set_op_attrs(self.fc_attrs)
+        net.bn._set_op_attrs({"bool_val": False})  # test overwrite behavior
+        net.bn._set_op_attrs(self.bn_attrs)
+        net.sub._set_op_attrs(self.sub_attrs)
+        # assert hooks exist.
+        self.assertEqual(len(net.linear._forward_pre_hooks), 1)
+        self.assertEqual(len(net.linear._forward_post_hooks), 1)
+        # to_static
+        net = paddle.jit.to_static(
+            net, input_spec=[InputSpec.from_tensor(self.x)])
+
+        # assert attrs have been set.
+        self.check_op_attrs(net.forward.concrete_program.main_program)
+
+        # assert hooks have been cleaned.
+        self.assertEqual(len(net.linear._forward_pre_hooks), 0)
+        self.assertEqual(len(net.linear._forward_post_hooks), 0)
+
+    def check_op_attrs(self, main_program):
+        for cur_block in main_program.blocks:
+            ops = cur_block.ops
+            for op in ops:
+                if op.type not in self.infos: continue
+                for attr_name, expect_vals in six.iteritems(self.infos[
+                        op.type]):
+                    op_vals = op.desc.attr(attr_name)
+                    if not isinstance(expect_vals, list):
+                        expect_vals = [expect_vals]
+                        op_vals = [op_vals]
+
+                    for (op_val, expect_val) in zip(op_vals, expect_vals):
+                        if isinstance(op_val, float):
+                            # C++ vs python: 3.799999952316284 ~= 3.8
+                            self.assertAlmostEqual(op_val, expect_val)
+                        else:
+                            self.assertEqual(op_val, expect_val)
+
+    def test_set_op_attrs_with_sub_block(self):
+        net = NetWithOpAttr(self.in_num, self.out_num)
+        # set attrs
+        net.linear._set_op_attrs({
+            "int_vals": [0, 0]
+        })  # test overwrite behavior
+        net.linear._set_op_attrs(self.fc_attrs)
+        net.bn._set_op_attrs(self.bn_attrs)
+        net.sub._set_op_attrs(self.sub_attrs)
+        # assert hooks exist.
+        self.assertEqual(len(net.linear._forward_pre_hooks), 1)
+        self.assertEqual(len(net.linear._forward_post_hooks), 1)
+
+        # assert attrs have been set.
+        self.check_op_attrs(net.with_cond.concrete_program.main_program)
+
+        # assert hooks have been cleaned.
+        self.assertEqual(len(net.linear._forward_pre_hooks), 0)
+        self.assertEqual(len(net.linear._forward_post_hooks), 0)
+
+    def test_type_error(self):
+        net = NetWithOpAttr(self.in_num, self.out_num)
+        # attrs should be dict
+        with self.assertRaises(TypeError):
+            net.linear._set_op_attrs([self.fc_attrs])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
index e72688d800ba59..0fffb0c985375b 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
@@ -17,6 +17,7 @@
 import gast
 import inspect
 import numpy as np
+import paddle
 import paddle.fluid as fluid
 import unittest
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
index 70749c2e24447e..f7cdb12a1ab673 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py
@@ -29,10 +29,10 @@ def dyfunc_tensor_shape_1(x):
 
 
 def dyfunc_tensor_shape_2(x):
-    x = fluid.dygraph.to_variable(x)
+    x = paddle.to_tensor(x)
     shape = x.shape
     shape2 = shape
-    res = fluid.layers.reshape(x, shape2)
+    res = paddle.reshape(x, shape2)
     return res
 
 
@@ -85,6 +85,13 @@ def dyfunc_tuple_shape_2(x):
     return res
 
 
+def dyfunc_tuple_shape_3(x):
+    x = paddle.to_tensor(x)
+    a, b = paddle.shape(x)
+    res = paddle.reshape(x, shape=(b, a))
+    return res
+
+
 def dyfunc_paddle_shape_api(x):
     x = paddle.to_tensor(x)
     # paddle.shape will not be converted.
@@ -190,7 +197,7 @@ def dyfunc_with_while_3(x):
 
 
 def dyfunc_with_while_4(x):
-    x = fluid.dygraph.to_variable(x)
+    x = paddle.to_tensor(x)
     y = numpy.ones(5)
     y_shape_0 = y.shape[0]
     i = 1
@@ -337,6 +344,18 @@ def _set_expected_op_num(self):
         self.expected_slice_op_num = 2
 
 
+class TestTupleShape3(TestTensorShapeBasic):
+    def init_test_func(self):
+        self.input = numpy.ones((5, 7)).astype("int32")
+        self.input_spec = [paddle.static.InputSpec(shape=[5, 7], dtype="int32")]
+        self.dygraph_func = dyfunc_tuple_shape_3
+
+    def _set_expected_op_num(self):
+        self.expected_op_num = 5
+        self.expected_shape_op_num = 1
+        self.expected_slice_op_num = 2
+
+
 class TestPaddleShapeApi(TestTensorShapeBasic):
     def init_test_func(self):
         self.input = numpy.ones((5, 7)).astype("int32")
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
index 349d5f82dbf545..23dae317386918 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py
@@ -212,7 +212,7 @@ def test_row_parallel_layer(self):
             optimizer_b.step()
 
             np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=1e-5)
+                loss_a.numpy(), loss_b.numpy(), rtol=5e-6)
 
     def test_parallel_embedding(self):
         batch_size = 17
@@ -265,9 +265,67 @@ def test_parallel_embedding(self):
             optimizer_a.step()
             optimizer_b.step()
 
+            print(loss_a.numpy(), loss_b.numpy())
+
             np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy())
 
+    def test_parallel_cross_entropy(self):
+        batch_size = 2
+        seq_length = 1
+        class_size_per_card = 2
+        vocab_size = class_size_per_card * self.model_parallel_size
+        seed = 1025
+
+        set_random_seed(seed)
+        rank_id = dist.get_rank()
+
+        # model_a
+        model_a = fleet.meta_parallel.ParallelCrossEntropy()
+
+        model_b = paddle.nn.CrossEntropyLoss(reduction="none")
+
+        paddle.seed(rank_id * 10)
+        random.seed(seed)
+        np.random.seed(seed)
+
+        for _ in range(5):
+            np_label = np.random.randint(0, vocab_size,
+                                         (batch_size, seq_length))
+            label = paddle.to_tensor(np_label, dtype="int64")
+
+            data = paddle.randn(
+                shape=[batch_size, seq_length, class_size_per_card],
+                dtype='float32')
+            data.stop_gradient = False
+
+            check_group = dist.new_group(list(range(self.model_parallel_size)))
+            integral_data = []
+            partial_data = data.clone().detach()
+            paddle.distributed.all_gather(
+                integral_data, partial_data, group=check_group)
+            integral_data = paddle.concat(integral_data, axis=-1)
+            integral_data = integral_data.detach().clone()
+            integral_data.stop_gradient = False
+
+            loss_a = model_a(data, label).sum() / batch_size
+            loss_b = model_b(integral_data, label).sum() / batch_size
+            print("loss_a: ", loss_a.numpy(), "loss_b: ", loss_b.numpy())
+
+            np.testing.assert_allclose(
+                loss_a.numpy(), loss_b.numpy(), rtol=1e-6)
+
+            loss_a.backward()
+            loss_b.backward()
+
+            integral_grad = []
+            partial_grad = data.grad.clone().detach()
+            paddle.distributed.all_gather(
+                integral_grad, partial_grad, group=check_group)
+            integral_grad = paddle.concat(integral_grad, axis=-1)
+
+            np.testing.assert_allclose(
+                integral_data.grad.numpy(), integral_grad.numpy(), rtol=1e-6)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
index a9f251f3079cef..f9ec49d88172a6 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py
@@ -32,14 +32,36 @@ def set_random_seed(seed, dp_id, rank_id):
     paddle.seed(seed + rank_id)
 
 
-vocab_size = 5
+vocab_size = 20
 hidden_size = 10
 inner_size = 8
-output_size = 2
+output_size = 10
 seq_length = 2
 batch_size = 4
 
 
+def parallel_matmul(lm_output, logit_weights, parallel_output):
+    hcg = fleet.get_hybrid_communicate_group()
+    model_parallel_group = hcg.get_model_parallel_group()
+    world_size = hcg.get_model_parallel_world_size()
+    rank = hcg.get_model_parallel_rank()
+
+    if world_size > 1:
+        input_parallel = paddle.distributed.collective._c_identity(
+            lm_output, group=model_parallel_group)
+
+        logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True)
+
+        if parallel_output:
+            return logits
+
+        return paddle.distributed.collective._c_concat(
+            logits, group=model_parallel_group)
+    else:
+        logits = paddle.matmul(lm_output, logit_weights, transpose_y=True)
+        return logits
+
+
 class SimpleMPNet(fluid.dygraph.Layer):
     def __init__(self, vocab_size, hidden_size, inner_size, output_size,
                  np_fc1, np_fc2, mp_id):
@@ -86,6 +108,7 @@ def forward(self, x):
         x = self.linear1(x)
         x = self.linear2(x)
         x = self.linear3(x)
+        x = parallel_matmul(x, self.embedding.weight, False)
         return x
 
 
@@ -128,6 +151,7 @@ def forward(self, x):
         x = self.linear1(x)
         x = self.linear2(x)
         x = self.linear3(x)
+        x = paddle.matmul(x, self.embedding.weight, transpose_y=True)
         return x
 
 
@@ -192,7 +216,7 @@ def test_mp_model(self):
             loss_b = self.train_batch(batch, model_b, optimizer_b, False)
 
             np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=1e-5)
+                loss_a.numpy(), loss_b.numpy(), rtol=1e-6)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
index 010086bfbbc47f..e3c21eaa78d716 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py
@@ -160,7 +160,8 @@ def check_output_with_option(self,
                                  use_gpu,
                                  atol=1e-5,
                                  flatten=False,
-                                 quant=False):
+                                 quant=False,
+                                 rtol=1e-5):
         '''
         Check whether calculating on CPU and GPU, enable TensorRT or disable
         TensorRT, enable MKLDNN or disable MKLDNN
@@ -260,7 +261,7 @@ def check_output_with_option(self,
 
             self.assertTrue(
                 np.allclose(
-                    out, tensorrt_output, atol=atol),
+                    out, tensorrt_output, rtol=rtol, atol=atol),
                 "Output has diff between GPU and TensorRT. ")
 
         # Check whether the mkldnn results and the CPU results are the same.
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
index 7f613c4765963d..ebbf724d0b4ead 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py
@@ -36,6 +36,7 @@ def setUp(self):
                 groups=self.conv_groups,
                 padding=self.conv_padding,
                 bias_attr=False,
+                use_cudnn=self.use_cudnn,
                 act=None)
         self.feeds = {
             "data": np.random.random([1, 6, 64, 64]).astype("float32"),
@@ -50,6 +51,7 @@ def set_params(self):
         self.conv_filter_size = 6
         self.conv_groups = 3
         self.conv_padding = [1, 1]
+        self.use_cudnn = True
 
     def test_check_output(self):
         if core.is_compiled_with_cuda():
@@ -65,6 +67,7 @@ def set_params(self):
         self.conv_filter_size = 6
         self.conv_groups = 3
         self.conv_padding = 'VALID'
+        self.use_cudnn = True
 
 
 class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest):
@@ -73,6 +76,7 @@ def set_params(self):
         self.conv_filter_size = 6
         self.conv_groups = 3
         self.conv_padding = 'SAME'
+        self.use_cudnn = True
 
 
 class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest):
@@ -81,6 +85,16 @@ def set_params(self):
         self.conv_filter_size = 6
         self.conv_groups = 6
         self.conv_padding = [1, 1]
+        self.use_cudnn = False
+
+
+class TensorRTSubgraphPassDepthwiseConv2Test(TensorRTSubgraphPassConvTest):
+    def set_params(self):
+        self.conv_num_filters = 12
+        self.conv_filter_size = 6
+        self.conv_groups = 6
+        self.conv_padding = [1, 1]
+        self.use_cudnn = False
 
 
 class TensorRTSubgraphPassConvTransposeTest(InferencePassTest):
@@ -151,6 +165,16 @@ def set_params(self):
         self.use_cudnn = True
 
 
+class TensorRTSubgraphPassConvTranspose2Test(
+        TensorRTSubgraphPassConvTransposeTest):
+    def set_params(self):
+        self.conv_num_filters = 12
+        self.conv_filter_size = 4
+        self.conv_groups = 6
+        self.conv_padding = [1, 1]
+        self.use_cudnn = False
+
+
 class TensorRTSubgraphPassDepthwiseConvTransposeTest(
         TensorRTSubgraphPassConvTransposeTest):
     def set_params(self):
@@ -195,7 +219,7 @@ def setUp(self):
             }, {
                 "conv2d_0.tmp_0": [16, 6, 16, 16],
                 "data": [16, 6, 16, 16],
-                "depthwise_conv2d_0.tmp_0": [32, 6, 64, 64]
+                "depthwise_conv2d_0.tmp_0": [16, 6, 16, 16]
             }, False)
 
         self.fetch_list = [conv_out]
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
new file mode 100644
index 00000000000000..75f5328ac1c419
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTGatherNdTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(name="data", shape=[-1, 3, 4], dtype="float32")
+            index = fluid.data(name="index", shape=[-1, 2, 2], dtype="int32")
+            gather_nd = fluid.layers.gather_nd(data, index)
+            out = fluid.layers.batch_norm(gather_nd, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([2, 3, 4]).astype("float32"),
+            "index":
+            np.array([[[0, 1], [1, 0]], [[1, 2], [0, 1]]]).astype("int32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTGatherNdTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTGatherNdTest.DynamicShapeParam({
+            'data': [1, 3, 4],
+            'index': [1, 2, 2]
+        }, {'data': [3, 3, 4],
+            'index': [3, 2, 2]}, {'data': [3, 3, 4],
+                                  'index': [3, 2, 2]}, False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TRTGatherNdFp16Test(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 5120, 768], dtype="float32")
+            index = fluid.data(name="index", shape=[-1, 4096, 2], dtype="int32")
+            gather_nd = fluid.layers.gather_nd(data, index)
+            out = fluid.layers.batch_norm(gather_nd, is_test=True)
+
+        index_data = np.zeros((1, 4096, 2), dtype='int32')
+        self.feeds = {
+            "data": np.random.random([1, 5120, 768]).astype("float32"),
+            "index": index_data,
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTGatherNdFp16Test.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTGatherNdFp16Test.DynamicShapeParam({
+            'data': [1, 5120, 768],
+            'index': [1, 4096, 2]
+        }, {'data': [3, 5120, 768],
+            'index':
+            [3, 4096, 2]}, {'data': [3, 5120, 768],
+                            'index': [3, 4096, 2]}, False)
+
+    def test_check_output(self, atol=1e-3):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
new file mode 100644
index 00000000000000..bb5e8e99b09269
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from inference_pass_test import InferencePassTest
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.core import PassVersionChecker
+from paddle.fluid.core import AnalysisConfig
+
+
+class TRTReduceSumTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 10, 768], dtype="float32")
+            reduce_sum = fluid.layers.reduce_sum(
+                data, dim=[2, -1], keep_dim=True)
+            out = fluid.layers.batch_norm(reduce_sum, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([3, 3, 10, 768]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTReduceSumTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTReduceSumTest.DynamicShapeParam({
+            'data': [1, 3, 8, 8]
+        }, {'data': [3, 3, 10, 768]}, {'data': [3, 3, 10, 768]}, False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TRTReduceSumAllTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 10, 768], dtype="float32")
+            reduce_sum = fluid.layers.reduce_sum(data, keep_dim=True)
+            out = fluid.layers.batch_norm(reduce_sum, is_test=True)
+
+        self.feeds = {
+            "data": np.random.random([3, 3, 10, 768]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TRTReduceSumAllTest.TensorRTParam(
+            1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+        self.dynamic_shape_params = TRTReduceSumAllTest.DynamicShapeParam({
+            'data': [1, 3, 8, 8]
+        }, {'data': [3, 3, 10, 768]}, {'data': [3, 3, 10, 768]}, False)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, flatten=True)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
index d895ac44d89319..0406e03f54bd4c 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py
@@ -366,6 +366,61 @@ def test_check_output(self):
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
 
 
+class TensorRTSubgraphPassLayerNormDynamicTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            out = fluid.layers.layer_norm(
+                data, begin_norm_axis=self.begin_norm_axis)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.set_trt_params()
+        self.fetch_list = [out]
+
+    def set_trt_params(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassLayerNormDynamicTest.TensorRTParam(
+            1 << 30, 32, 0, self.precision, self.serialize, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassLayerNormDynamicTest.DynamicShapeParam(
+            {
+                'data': [1, 3, 64, 64],
+            }, {'data': [8, 8, 64, 64], }, {'data': [4, 4, 64, 64], }, False)
+
+    def set_params(self):
+        self.begin_norm_axis = 2
+        self.precision = AnalysisConfig.Precision.Float32
+        self.serialize = True
+
+    def test_check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
+class TensorRTSubgraphPassLayerNormDynamicFP16Test(
+        TensorRTSubgraphPassLayerNormDynamicTest):
+    def set_params(self):
+        self.begin_norm_axis = 2
+        self.precision = AnalysisConfig.Precision.Half
+        self.serialize = True
+
+    def test_check_output(self):
+        if os.path.exists(self.path + "_opt_cache"):
+            shutil.rmtree(self.path + "_opt_cache")
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, atol=0.01, rtol=0.01)
+            self.assertTrue(
+                PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
+
+
 class TensorRTSubgraphPassLayerNormBeginNormAxis2Test(
         TensorRTSubgraphPassLayerNormTest):
     def set_params(self):
diff --git a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
index 0e4fd8f69dcd3f..ea125ccf3fc6c0 100644
--- a/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
+++ b/python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 import os
-
+import sys
 import six
 import unittest
 import time
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
index c93201946b2757..90614ccb3bc154 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
@@ -19,18 +19,19 @@
 import paddle.fluid as fluid
 import os
 from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import _global_flags
 
 
 def check():
-    print("check: fluid.core.globals()['FLAGS_use_mkldnn']=",
-          fluid.core.globals()["FLAGS_use_mkldnn"])
+    print("check: _global_flags()['FLAGS_use_mkldnn']=",
+          _global_flags()["FLAGS_use_mkldnn"])
     print("check: fluid.get_flags('FLAGS_use_mkldnn')=",
           fluid.get_flags(['FLAGS_use_mkldnn']))
     print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE'])
     print("check: FLAGS_tracer_mkldnn_ops_on=",
-          fluid.core.globals()['FLAGS_tracer_mkldnn_ops_on'])
+          _global_flags()['FLAGS_tracer_mkldnn_ops_on'])
     print("check: FLAGS_tracer_mkldnn_ops_off=",
-          fluid.core.globals()['FLAGS_tracer_mkldnn_ops_off'])
+          _global_flags()['FLAGS_tracer_mkldnn_ops_off'])
     a_np = np.random.uniform(-2, 2, (10, 20, 30)).astype(np.float32)
     b_np = np.random.uniform(-5, 5, (10, 20, 30)).astype(np.float32)
     helper = LayerHelper(fluid.unique_name.generate(str("test")), act="relu")
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py
index 8f5715a0d0afcf..3d9ef39680dc05 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_use_mkldnn.py
@@ -19,11 +19,12 @@
 import paddle.fluid as fluid
 import os
 from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.framework import _global_flags
 
 
 def check():
-    print("check: fluid.core.globals()['FLAGS_use_mkldnn']=",
-          fluid.core.globals()["FLAGS_use_mkldnn"])
+    print("check: _global_flags()['FLAGS_use_mkldnn']=",
+          _global_flags()["FLAGS_use_mkldnn"])
     print("check: fluid.get_flags('FLAGS_use_mkldnn')=",
           fluid.get_flags(['FLAGS_use_mkldnn']))
     print("check: DNNL_VERBOSE=", os.environ['DNNL_VERBOSE'])
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
index f31ddf921f819c..b473d2643d3788 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py
@@ -18,6 +18,7 @@
 import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
+from paddle import enable_static
 from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2DTransposeOp
 
 
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
index 46bdbb1a420af2..d65919aa434c38 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py
@@ -32,7 +32,8 @@ def set_confs(self):
     def test_check_output(self):
         for use_seq in {True, False}:
             self.attrs['use_seq'] = use_seq
-            self.check_output(check_dygraph=False, no_check_set=["Cell"])
+            self.check_output(
+                check_dygraph=False, no_check_set=["Cell"], atol=2e-2)
 
     def setUp(self):
         self.op_type = 'fusion_lstm'
diff --git a/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py
new file mode 100644
index 00000000000000..37fa5f8cad2abe
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_coalesce_tensor_op_npu.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+paddle.enable_static()
+SEED = 2021
+alignment = 512
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAllocContinuousSpace(OpTest):
+    def setUp(self):
+        self.__class__.use_npu = True
+        self.op_type = "coalesce_tensor"
+        self.dtype, self.fluid_dtype = self.init_dtype()
+        attrs = self.init_attr()
+        self.copy_data = attrs["copy_data"]
+        self.constant = attrs["constant"]
+        self.set_constant = attrs["set_constant"]
+        self.Inputs = self.init_input()
+        self.Outputs, self.FusedOutput = self.init_output(
+            self.Inputs, self.set_constant, self.constant)
+        self.inputs = {'Input': self.Inputs}
+        self.attrs = attrs
+        self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput}
+
+    def init_dtype(self):
+        return np.float32, int(core.VarDesc.VarType.FP32)
+
+    def init_input(self):
+        inputs = []
+        inputs.append(("x1", np.zeros([20, 3]).astype(self.dtype)))
+        inputs.append(("x2", np.zeros([20, 3]).astype(self.dtype)))
+        return inputs
+
+    def init_attr(self):
+        return {
+            "copy_data": False,
+            "set_constant": False,
+            "constant": 0.0,
+            "use_align": True,
+            "dtype": self.fluid_dtype
+        }
+
+    def init_output(self, input_list, set_constant, constant):
+        inputs = []
+        outputs = input_list
+
+        for input in input_list:
+            length = len(input[1].flatten())
+            aligned_len = (length + alignment) / alignment * alignment
+            out = np.zeros(int(aligned_len), dtype=self.dtype)
+            out[0:length] = input[1].flatten()
+            inputs.append(out)
+
+        coalesce_tensor_var = np.concatenate([input for input in inputs])
+        return outputs, coalesce_tensor_var
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            place=paddle.NPUPlace(0),
+            no_check_set=["FusedOutput"],
+            atol=1e-5,
+            check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestAllocContinuousSpace2(TestAllocContinuousSpace):
+    def init_attr(self):
+        return {
+            "copy_data": True,
+            "set_constant": False,
+            "constant": 0.5,
+            "use_align": True,
+            "dtype": self.fluid_dtype
+        }
+
+    def test_check_output(self):
+        self.check_output_with_place(
+            place=paddle.NPUPlace(0),
+            no_check_set=["FusedOutput"],
+            atol=1e-5,
+            check_dygraph=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 654723d8629900..9bf4d09cc36c35 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1191,7 +1191,9 @@ def find_actual(target_name, fetch_list):
                     np.float32, np.float64
                 ]:
                     actual_t = convert_uint16_to_float(actual_t)
-                    atol = max(atol, 0.03)
+                    rtol = 1.e-2
+                else:
+                    rtol = 1.e-5
 
                 if expect_t.dtype == np.uint16 and actual_t.dtype == np.uint16:
                     expect_t = convert_uint16_to_float(expect_t)
@@ -1204,7 +1206,11 @@ def find_actual(target_name, fetch_list):
 
                 self.assertTrue(
                     np.allclose(
-                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
+                        actual_t,
+                        expect_t,
+                        rtol=rtol,
+                        atol=atol,
+                        equal_nan=equal_nan),
                     "Output (" + out_name + ") has diff at " + str(place) +
                     "\nExpect " + str(expect_t) + "\n" + "But Got" +
                     str(actual_t) + " in class " + self.__class__.__name__)
diff --git a/python/paddle/fluid/tests/unittests/parallel_embedding_api.py b/python/paddle/fluid/tests/unittests/parallel_embedding_api.py
index 7460577403fb12..8907adbf46a971 100644
--- a/python/paddle/fluid/tests/unittests/parallel_embedding_api.py
+++ b/python/paddle/fluid/tests/unittests/parallel_embedding_api.py
@@ -48,23 +48,27 @@ def get_model(self, main_prog, startup_program, rank):
         with fluid.program_guard(main_prog, startup_program):
             fleet.init(is_collective=True)
             np.random.seed(2020)
-            np_array = np.random.rand(10, 8)
+            # (num_embeddings, embedding_dim) = (12, 8)
+            size = (12, 8)
+            np_array = np.random.rand(size[0], size[1])
 
             paddle.seed(2020)
-            data_in = paddle.randint(0, 8, shape=(10, 4))
+            data_in = paddle.randint(0, size[0], shape=(10, 4))
 
             data = paddle.static.data(
                 name='tindata', shape=[10, 1000], dtype="float32")
+            per_part_size = size[0] // 2
             if rank == 0:
                 param_attr = paddle.fluid.ParamAttr(
                     initializer=paddle.fluid.initializer.NumpyArrayInitializer(
-                        np_array[0:5, :]), )
+                        np_array[0:per_part_size, :]), )
             else:
                 param_attr = paddle.fluid.ParamAttr(
                     initializer=paddle.fluid.initializer.NumpyArrayInitializer(
-                        np_array[5:10, :]), )
+                        np_array[per_part_size:size[0], :]), )
 
             emb_out = paddle.distributed.split(
-                data_in, (8, 8),
+                data_in,
+                size,
                 operation="embedding",
                 num_partitions=2,
                 weight_attr=param_attr)
diff --git a/python/paddle/fluid/tests/unittests/parallel_embedding_api_none_divisible.py b/python/paddle/fluid/tests/unittests/parallel_embedding_api_none_divisible.py
deleted file mode 100644
index 75b966fdc57272..00000000000000
--- a/python/paddle/fluid/tests/unittests/parallel_embedding_api_none_divisible.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import numpy as np
-import argparse
-import os
-import sys
-import signal
-import time
-import socket
-from contextlib import closing
-from six import string_types
-import math
-import paddle
-import paddle.fluid as fluid
-import paddle.fluid.profiler as profiler
-import paddle.fluid.unique_name as nameGen
-from paddle.fluid import core
-import paddle.distributed.fleet as fleet
-from paddle.fluid.incubate.fleet.base import role_maker
-import unittest
-from multiprocessing import Process
-import paddle.fluid.layers as layers
-from functools import reduce
-from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
-
-paddle.enable_static()
-
-
-class TestParallelEmbeddingAPINoneDivisible(TestCollectiveAPIRunnerBase):
-    def __init__(self):
-        self.global_ring_id = 0
-
-    def get_model(self, main_prog, startup_program, rank):
-        with fluid.program_guard(main_prog, startup_program):
-            fleet.init(is_collective=True)
-            np.random.seed(2020)
-            np_array = np.random.rand(9, 8)
-            paddle.seed(2020)
-            data_in = paddle.randint(0, 7, shape=(10, 4))
-
-            data = paddle.static.data(
-                name='tindata', shape=[10, 1000], dtype="float32")
-            if rank == 0:
-                param_attr = paddle.fluid.ParamAttr(
-                    initializer=paddle.fluid.initializer.NumpyArrayInitializer(
-                        np_array[0:5, :]), )
-            else:
-                param_attr = paddle.fluid.ParamAttr(
-                    initializer=paddle.fluid.initializer.NumpyArrayInitializer(
-                        np_array[5:9, :]), )
-
-            emb_out = paddle.distributed.split(
-                data_in, (7, 8),
-                operation="embedding",
-                num_partitions=2,
-                weight_attr=param_attr)
-
-            return [data_in, emb_out]
-
-
-if __name__ == "__main__":
-    runtime_main(TestParallelEmbeddingAPINoneDivisible, "parallel_embedding")
diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
new file mode 100644
index 00000000000000..7211bd3e92f790
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_multi_device.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import time
+import math
+
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import os
+import signal
+from functools import reduce
+from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.distributed.fleet as fleet
+
+paddle.enable_static()
+
+DTYPE = "float32"
+paddle.dataset.mnist.fetch()
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+def cnn_model(data):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=data,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu",
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu",
+        param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant(
+            value=0.01)))
+
+    SIZE = 10
+    input_shape = conv_pool_2.shape
+    param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE]
+    scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5
+
+    with fluid.device_guard("gpu:1"):
+        predict = fluid.layers.fc(
+            input=conv_pool_2,
+            size=SIZE,
+            act="softmax",
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01)))
+        # To cover @RENAMED@GRADIENT
+        predict2 = fluid.layers.fc(
+            input=conv_pool_1,
+            size=SIZE,
+            act="softmax",
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Constant(value=0.01)))
+        predict += predict2
+    return predict
+
+
+class TestDistMnist2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
+        # Input data
+        with fluid.device_guard("gpu:0"):
+            images = fluid.layers.data(
+                name='pixel', shape=[1, 28, 28], dtype=DTYPE)
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+            if dist_strategy:
+                data_loader = fluid.io.DataLoader.from_generator(
+                    feed_list=[images, label],
+                    capacity=64,
+                    use_double_buffer=False,
+                    iterable=False)
+        # Train program
+        predict = cnn_model(images)
+        with fluid.device_guard("gpu:1"):
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+
+        # Evaluator
+        with fluid.device_guard("gpu:1"):
+            batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
+            batch_acc = fluid.layers.accuracy(
+                input=predict, label=label, total=batch_size_tensor)
+
+        inference_program = fluid.default_main_program().clone()
+        base_lr = self.lr
+        passes = [30, 60, 80, 90]
+        steps_per_pass = 10
+        bd = [steps_per_pass * p for p in passes]
+        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
+        lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
+        opt = fluid.optimizer.Momentum(
+            learning_rate=lr_val,
+            momentum=0.9,
+            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0))
+
+        acc_steps = 2  # accumulated steps for pipeline
+        if dist_strategy:
+            # Reader
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size)
+            fleet.init(is_collective=True)
+            strategy = fleet.DistributedStrategy()
+            strategy.pipeline = True
+            strategy.amp = True
+            strategy.pipeline_configs = {
+                'micro_batch_size': batch_size,
+                'schedule_mode': 'F-then-B',
+                'accumulate_steps': acc_steps
+            }
+            dist_opt = fleet.distributed_optimizer(
+                optimizer=opt, strategy=strategy)
+            dist_opt.minimize(avg_cost)
+        else:
+            opt.minimize(avg_cost)
+            # Reader
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps)
+
+        if dist_strategy:
+            return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader
+        else:
+            return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistMnist2x2)
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index aea2a074aedd58..715e66e563337f 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -804,11 +804,48 @@ def test_adam_save_load(self):
             adam.minimize(b)
             state_dict = adam.state_dict()
             fluid.save_dygraph(state_dict, "paddle_dy")
-            para_state_dict, opti_state_dict = fluid.load_dygraph("paddle_dy")
-            adam.set_state_dict(opti_state_dict)
+            para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy")
+            adam.set_state_dict(opt_state_dict)
 
             paddle.enable_static()
 
+    def test_adam_save_load_error(self):
+        paddle.disable_static()
+
+        def get_opt(dtype, shape):
+            with paddle.utils.unique_name.guard():
+                paddle.set_default_dtype(dtype)
+                a = paddle.rand([4, 10])
+                linear = paddle.nn.Linear(10, 10)
+                b = linear(a)
+                state_dict = linear.state_dict()
+                fluid.save_dygraph(state_dict, "paddle_dy")
+
+                scheduler = paddle.optimizer.lr.NoamDecay(
+                    d_model=0.01, warmup_steps=100, verbose=True)
+                adam = paddle.fluid.optimizer.Adam(
+                    learning_rate=scheduler,
+                    parameter_list=linear.parameters(),
+                    use_global_beta_pow=True)
+                adam.minimize(b)
+                return adam
+
+        adam = get_opt('float32', [10, 10])
+
+        state_dict = adam.state_dict()
+        fluid.save_dygraph(state_dict, "paddle_dy")
+        para_state_dict, opt_state_dict = fluid.load_dygraph("paddle_dy")
+        adam.set_state_dict(opt_state_dict)
+
+        adam2 = get_opt('float64', [10, 10])  # dtype not match
+        self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict)
+
+        adam3 = get_opt('float32', [10, 10])  # shape not match
+        opt_state_dict['beta1_pow_acc_0'] = np.array(
+            [0.9, 0.9], dtype='float32')
+        self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict)
+        paddle.enable_static()
+
 
 class TestAdamOpV2Group(TestAdamOpV2):
     def test_adam_op(self):
diff --git a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
index 3f33120d1f79f0..3faf7f6862058d 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_checkpoint.py
@@ -268,7 +268,7 @@ def test_not_use(self):
     def test_checker(self):
         os.environ.pop("PADDLE_JOB_ID", None)
         try:
-            checker = AutoCheckpointChecker()
+            checker = acp.AutoCheckpointChecker()
             self.assertFalse(True)
         except Exception as e:
             pass
diff --git a/python/paddle/fluid/tests/unittests/test_c_embedding_op.py b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
new file mode 100644
index 00000000000000..c0cae78ed29538
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.framework import core
+
+
+def get_c_embedding(start, end, table, ids):
+    index = ids.flatten()
+    input_mask = (index < start) | (index >= end)
+    masked_input = index - start
+    masked_input[input_mask] = 0
+    output = table[masked_input]
+    output[input_mask] = 0.0
+    return output
+
+
+class TestCEmbeddingOp(OpTest):
+    def setUp(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 31)).astype("float64")
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4, 5)).astype("int32")
+        self.start_index = 10
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 5, 31))}
+        self.attrs = {'start_index': self.start_index}
+
+    def test_check_output_gpu(self):
+        if core.is_compiled_with_cuda():
+            self.check_output_with_place(core.CUDAPlace(0))
+
+    def test_check_grad_gpu(self):
+        if core.is_compiled_with_cuda():
+            self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
index f0c042eb7e95b6..81d246d35b8bba 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py
@@ -257,11 +257,10 @@ def check_with_place(self,
         elif col_type == "parallel_embedding":
             result_data = tr0_out[0]
             np.random.seed(2020)
-            need_result = np.random.rand(10, 8)
+            need_result = np.random.rand(12, 8)
             for i in range(result_data.shape[0]):
                 for j in range(result_data.shape[1]):
                     data = result_data[i][j]
-                    if data >= 4: data += 1
                     assert np.allclose(
                         tr0_out[1][i][j], need_result[data], atol=1e-08)
         elif col_type == "row_parallel_linear":
diff --git a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py b/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py
index fc9775b3566b11..955adf08c48241 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_split_embedding_none_divisible.py
@@ -16,20 +16,24 @@
 import unittest
 import numpy as np
 import paddle
-
-from test_collective_api_base import TestDistBase
+from paddle.distributed import fleet
 
 paddle.enable_static()
 
 
-class TestParallelEmbeddingNoneDivisibleAPI(TestDistBase):
-    def _setup_config(self):
-        pass
+class TestCollectiveSplitAssert(unittest.TestCase):
+    def network(self):
+        fleet.init()
+        data = paddle.static.data(
+            name='tindata', shape=[10, 1000], dtype="float32")
+        emb_out = paddle.distributed.split(
+            data, (7, 8), operation="embedding", num_partitions=2)
 
-    def test_parallel_embedding_none_divisible(self):
-        self.check_with_place("parallel_embedding_api_none_divisible.py",
-                              "parallel_embedding", "nccl")
+    def test_assert(self):
+        with self.assertRaises(AssertionError):
+            self.network()
 
 
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
index ea59e070cbd51d..d9c6406422277c 100644
--- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py
+++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py
@@ -28,6 +28,8 @@
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.distributed.fleet as fleet
 
+from paddle.distributed.utils import find_free_ports
+
 paddle.enable_static()
 
 
@@ -101,12 +103,9 @@ def run_ut(self):
 
         os.environ["PADDLE_PSERVER_NUMS"] = "1"
         os.environ["PADDLE_TRAINERS_NUM"] = "1"
-        os.environ["POD_IP"] = "127.0.0.1"
-        os.environ["PADDLE_PORT"] = "36001"
         os.environ["PADDLE_TRAINER_ID"] = "0"
         os.environ["PADDLE_TRAINERS_NUM"] = "1"
-        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
-            "127.0.0.1:36001"
+        os.environ["POD_IP"] = "127.0.0.1"
 
         role = role_maker.PaddleCloudRoleMaker()
 
@@ -150,8 +149,6 @@ def runTest(self):
     pass
 
 os.environ["TRAINING_ROLE"] = "PSERVER"
-os.environ["http_proxy"] = ""
-os.environ["https_proxy"] = ""
 
 half_run_server = RunServer()
 half_run_server.run_ut()
@@ -160,9 +157,12 @@ def runTest(self):
         server_file = "run_server_for_communicator_geo.py"
         with open(server_file, "w") as wb:
             wb.write(run_server_cmd)
+
+        port = find_free_ports(1).pop()
+
         os.environ["TRAINING_ROLE"] = "PSERVER"
-        os.environ["http_proxy"] = ""
-        os.environ["https_proxy"] = ""
+        os.environ["PADDLE_PORT"] = str(port)
+        os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:{}".format(port)
 
         _python = sys.executable
 
@@ -173,17 +173,14 @@ def runTest(self):
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE)
 
-        outs, errs = ps_proc.communicate(timeout=15)
-
-        time.sleep(1)
+        time.sleep(5)
 
         os.environ["TRAINING_ROLE"] = "TRAINER"
-        os.environ["http_proxy"] = ""
-        os.environ["https_proxy"] = ""
 
         self.run_ut()
         ps_proc.kill()
         ps_proc.wait()
+        outs, errs = ps_proc.communicate()
 
         if os.path.exists(server_file):
             os.remove(server_file)
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index e55997c229e217..db05801c7227b0 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -20,7 +20,8 @@
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient
+from paddle.fluid.tests.unittests.testsuite import create_op
 from paddle.fluid import Program, program_guard
 
 
@@ -167,6 +168,52 @@ def test_check_grad_no_input(self):
     globals()[cls_name] = TestConv2DCUDNNFp16
 
 
+def create_test_cudnn_bf16_class(parent):
+    @unittest.skipIf(
+        not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
+        "core is not compiled with CUDA or the cudnn version is below 8.1.0"
+    )
+    class TestConv2DCUDNNBF16(parent):
+        def get_numeric_grad(self, place, check_name):
+            scope = core.Scope()
+            self._check_grad_helper()
+            op = create_op(scope, self.op_type, self.inputs, self.outputs,
+                           self.attrs)
+            return get_numeric_gradient(place, scope, op, self.inputs_fp32,
+                                        check_name, ['Output'])
+
+        def init_kernel_type(self):
+            self.use_cudnn = True
+            self.no_need_check_grad = True
+            self.dtype = np.uint16
+
+        def test_check_output(self):
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-2)
+
+        def test_check_grad_no_filter(self):
+            place = core.CUDAPlace(0)
+            numeric_grads = self.get_numeric_grad(place, 'Input')
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Output',
+                no_grad_set=set(['Filter']),
+                user_defined_grads=[numeric_grads])
+
+        def test_check_grad_no_input(self):
+            place = core.CUDAPlace(0)
+            numeric_grads = self.get_numeric_grad(place, 'Filter')
+            self.check_grad_with_place(
+                place, ['Filter'],
+                'Output',
+                no_grad_set=set(['Input']),
+                user_defined_grads=[numeric_grads])
+
+    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16")
+    TestConv2DCUDNNBF16.__name__ = cls_name
+    globals()[cls_name] = TestConv2DCUDNNBF16
+
+
 def create_test_channel_last_class(parent):
     class TestChannelLastCase(parent):
         def init_data_format(self):
@@ -319,7 +366,15 @@ def setUp(self):
             'dilation': self.dilations
         }
 
-        input = np.random.random(self.input_size).astype(self.dtype)
+        if self.is_bfloat16_op():
+            input = np.random.random(self.input_size).astype(np.float32)
+            filter = np.random.uniform(-1, 1,
+                                       self.filter_size).astype(np.float32)
+        else:
+            input = np.random.random(self.input_size).astype(self.dtype)
+            filter = np.random.uniform(-1, 1,
+                                       self.filter_size).astype(self.dtype)
+
         if not self.has_cuda():
             self.fuse_relu_before_depthwise_conv = False
         if self.fuse_relu_before_depthwise_conv:
@@ -329,16 +384,27 @@ def setUp(self):
             input2 = np.maximum(input, 0.0)
         else:
             input2 = input
-        filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype)
         output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups,
                                                   conv2d_param)
-        output = output.astype(self.dtype)
 
-        self.inputs = {
-            'Input': OpTest.np_dtype_to_fluid_dtype(input),
-            'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
-        }
+        if self.is_bfloat16_op():
+            output = output.astype(np.float32)
+            self.inputs = {
+                'Input': convert_float_to_uint16(input),
+                'Filter': convert_float_to_uint16(filter)
+            }
+            self.inputs_fp32 = {
+                'Input': OpTest.np_dtype_to_fluid_dtype(input),
+                'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+            }
+        else:
+            output = output.astype(self.dtype)
+            self.inputs = {
+                'Input': OpTest.np_dtype_to_fluid_dtype(input),
+                'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
+            }
+
         self.attrs = {
             'strides': self.stride,
             'paddings': self.pad,
@@ -554,6 +620,15 @@ def init_group(self):
 create_test_cudnn_fp16_class(TestWith1x1, grad_check=False)
 create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False)
 
+#----------------Conv2DCUDNN bf16----------------
+
+create_test_cudnn_bf16_class(TestConv2DOp)
+create_test_cudnn_bf16_class(TestWithPad)
+create_test_cudnn_bf16_class(TestWithStride)
+create_test_cudnn_bf16_class(TestWithGroup)
+create_test_cudnn_bf16_class(TestWith1x1)
+create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1)
+
 
 class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
diff --git a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
index b8c498fe4a3c71..08589f0191d8c6 100644
--- a/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataloader_dataset.py
@@ -14,9 +14,12 @@
 
 from __future__ import division
 
+import sys
 import unittest
 import numpy as np
+
+import paddle
+import paddle.vision.transforms as transforms
 import paddle.fluid as fluid
 from paddle.io import *
@@ -37,5 +40,48 @@ def test_main(self):
             pass
 
 
+class TestDatasetWithDiffOutputPlace(unittest.TestCase):
+    def get_dataloader(self, num_workers):
+        dataset = paddle.vision.datasets.MNIST(
+            mode='test', transform=transforms.ToTensor())
+        loader = paddle.io.DataLoader(
+            dataset, batch_size=32, num_workers=num_workers, shuffle=True)
+        return loader
+
+    def run_check_on_cpu(self):
+        paddle.set_device('cpu')
+        loader = self.get_dataloader(0)
+        for image, label in loader:
+            self.assertTrue(image.place.is_cpu_place())
+            self.assertTrue(label.place.is_cpu_place())
+            break
+
+    def test_single_process(self):
+        self.run_check_on_cpu()
+        if paddle.is_compiled_with_cuda():
+            # Get (image, label) tuple from MNIST dataset
+            # - the image is on CUDAPlace, label is on CPUPlace
+            paddle.set_device('gpu')
+            loader = self.get_dataloader(0)
+            for image, label in loader:
+                self.assertTrue(image.place.is_gpu_place())
+                self.assertTrue(label.place.is_cuda_pinned_place())
+                break
+
+    def test_multi_process(self):
+        # DataLoader with multi-process mode is not supported on MacOs and Windows currently
+        if sys.platform != 'darwin' and sys.platform != 'win32':
+            self.run_check_on_cpu()
+            if paddle.is_compiled_with_cuda():
+                # Get (image, label) tuple from MNIST dataset
+                # - the image and label are on CPUPlace
+                paddle.set_device('gpu')
+                loader = self.get_dataloader(1)
+                for image, label in loader:
+                    self.assertTrue(image.place.is_cuda_pinned_place())
+                    self.assertTrue(label.place.is_cuda_pinned_place())
+                    break
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_diagflat.py b/python/paddle/fluid/tests/unittests/test_diagflat.py
new file mode 100644
index 00000000000000..ec74855ba25232
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_diagflat.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+from paddle.static import Program, program_guard
+
+
+class TestDiagFlatError(unittest.TestCase):
+    def test_errors(self):
+        paddle.enable_static()
+        with program_guard(Program(), Program()):
+
+            def test_diagflat_type():
+                x = [1, 2, 3]
+                output = paddle.diagflat(x)
+
+            self.assertRaises(TypeError, test_diagflat_type)
+
+            x = paddle.static.data('data', [3, 3])
+            self.assertRaises(TypeError, paddle.diagflat, x, offset=2.5)
+
+
+class TestDiagFlatAPI(unittest.TestCase):
+    def setUp(self):
+        self.input_np = np.random.random(size=(10, 10)).astype(np.float64)
+        self.expected0 = np.diagflat(self.input_np)
+        self.expected1 = np.diagflat(self.input_np, k=1)
+        self.expected2 = np.diagflat(self.input_np, k=-1)
+
+        self.input_np2 = np.random.random(size=(20)).astype(np.float64)
+        self.expected3 = np.diagflat(self.input_np2)
+        self.expected4 = np.diagflat(self.input_np2, k=1)
+        self.expected5 = np.diagflat(self.input_np2, k=-1)
+
+    def run_imperative(self):
+        x = paddle.to_tensor(self.input_np)
+        y = paddle.diagflat(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected0))
+
+        y = paddle.diagflat(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected1))
+
+        y = paddle.diagflat(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected2))
+
+        x = paddle.to_tensor(self.input_np2)
+        y = paddle.diagflat(x)
+        self.assertTrue(np.allclose(y.numpy(), self.expected3))
+
+        y = paddle.diagflat(x, offset=1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected4))
+
+        y = paddle.diagflat(x, offset=-1)
+        self.assertTrue(np.allclose(y.numpy(), self.expected5))
+
+    def run_static(self, use_gpu=False):
+        x = paddle.static.data(name='input', shape=[10, 10], dtype='float64')
+        x2 = paddle.static.data(name='input2', shape=[20], dtype='float64')
+        result0 = paddle.diagflat(x)
+        result3 = paddle.diagflat(x2)
+
+        place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace()
+        exe = paddle.static.Executor(place)
+        exe.run(paddle.static.default_startup_program())
+        res0, res3 = exe.run(
+            feed={"input": self.input_np,
+                  'input2': self.input_np2},
+            fetch_list=[result0, result3])
+
+        self.assertTrue(np.allclose(res0, self.expected0))
+        self.assertTrue(np.allclose(res3, self.expected3))
+
+    def test_cpu(self):
+        paddle.disable_static(place=paddle.CPUPlace())
+        self.run_imperative()
+        paddle.enable_static()
+
+        with paddle.static.program_guard(Program()):
+            self.run_static()
+
+    def test_gpu(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        paddle.disable_static(place=paddle.CUDAPlace(0))
+        self.run_imperative()
+        paddle.enable_static()
+
+        with paddle.static.program_guard(Program()):
+            self.run_static(use_gpu=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py
index edc510e4e766d7..78b06bd5333d79 100755
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -186,6 +186,76 @@ def run_pipeline_trainer(self, args):
         fleet.save_inference_model(exe, infer_save_dir_fleet,
                                    feeded_var_names, [avg_cost])

+    def run_use_fleet_api_20_trainer(self, args):
+        """
+        1. DistributedStrategy setup is removed here and left to get_model()
+        2. to run with the fleet 2.0 api, set both _use_fleet_api and _use_fleet_api_20 to True
+        3.
model saving is not covered by this test for now
+        """
+        assert args.update_method in ("nccl2", "bkcl")
+
+        self.lr = args.lr
+        print_to_err("use_fleet 2.0", "fleet.node_num:")
+
+        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
+            self.get_model(batch_size=args.batch_size)
+
+        if fluid.core.is_compiled_with_cuda():
+            device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
+            place = fluid.CUDAPlace(device_id)
+        elif fluid.core.is_compiled_with_xpu():
+            device_id = int(os.getenv("FLAGS_selected_xpus", "0"))
+            place = fluid.XPUPlace(device_id)
+        else:
+            raise ValueError(
+                "fleet dygraph api must run with paddlepaddle-xpu or paddlepaddle-gpu."
+            )
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        eprint(type(self).__name__, "run worker startup program done.")
+
+        feed_var_list = [
+            var
+            for var in fluid.default_main_program().global_block().vars.values()
+            if var.is_data
+        ]
+
+        eprint("feed_var_list:", feed_var_list)
+
+        if feed_var_list[0].name == 'label':
+            feed_var_list = feed_var_list[::-1]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        reader_generator = train_reader()
+
+        def get_data():
+            origin_batch = next(reader_generator)
+            if args.update_method != "local" and args.use_reader_alloc:
+                new_batch = []
+                for offset, item in enumerate(origin_batch):
+                    if offset % 2 == args.trainer_id:
+                        new_batch.append(item)
+                return new_batch
+            else:
+                return origin_batch
+
+        print_to_err(type(self).__name__, "begin to train on trainer")
+        out_losses = []
+        for i in six.moves.xrange(RUN_STEP):
+            loss, = exe.run(fluid.default_main_program(),
+                            fetch_list=[avg_cost.name],
+                            feed=feeder.feed(get_data()))
+            out_losses.append(loss[0])
+            print_to_err(type(self).__name__, "run step %d finished" % i)
+        print_to_err(type(self).__name__, "trainer run finished")
+        print_to_err(type(self).__name__, "dist losses: {}".format(out_losses))
+
+        if six.PY2:
+            print(pickle.dumps(out_losses))
+        else:
+            sys.stdout.buffer.write(pickle.dumps(out_losses))
+
     def run_use_fleet_api_trainer(self, args):
         assert args.update_method == "nccl2" or "bkcl"
@@ -630,6 +700,7 @@ def runtime_main(test_class):
     parser.add_argument('--use_hallreduce', action='store_true')
     parser.add_argument('--use_pipeline', action='store_true')
     parser.add_argument('--use_fleet_api', action='store_true')
+    parser.add_argument('--use_fleet_api_20', action='store_true')
     parser.add_argument('--use_local_sgd', action='store_true')
     parser.add_argument('--ut4grad_allreduce', action='store_true')
     parser.add_argument(
@@ -671,6 +742,8 @@ def runtime_main(test_class):
         model.run_pserver(args)
     elif args.use_fleet_api:
         model.run_use_fleet_api_trainer(args)
+    elif args.use_fleet_api_20:
+        model.run_use_fleet_api_20_trainer(args)
     elif args.use_pipeline:
         model.run_pipeline_trainer(args)
     else:
@@ -734,6 +807,7 @@ def setUp(self):
         self._nccl_comm_num = 1
         self._enable_backward_deps = False
         self._use_fleet_api = False
+        self._use_fleet_api_20 = False
         self._use_local_sgd = False
         self._ut4grad_allreduce = False
         self._use_hallreduce = False
@@ -1060,7 +1134,7 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id,
             tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce)

         if self._use_fleet_api:
-            tr_cmd += " --use_fleet_api"
+            tr_cmd += " --use_fleet_api_20" if self._use_fleet_api_20 else " --use_fleet_api"
         if self._use_local_sgd:
             tr_cmd += " --use_local_sgd"
         if self._ut4grad_allreduce:
diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py
b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py new file mode 100644 index 00000000000000..e729bfe0537528 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from test_dist_base import TestDistBase +import paddle +import os + +paddle.enable_static() +flag_name = os.path.splitext(__file__)[0] + + +class TestFleetMetaOptimizerPrecision(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._nccl2_reduce_layer = True + self._use_fleet_api = True + self._use_fleet_api_20 = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "dist_fleet_raw_program_optimizer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py index 6de04c14bfa708..332603b8129550 100755 --- a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py @@ -92,15 +92,12 @@ def forward(self, inputs): return inputs -def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): +def run_model(recompute_block=[], recompute_kwargs={}, enable_autocast=False): gen = paddle.seed(10) gen.manual_seed(10) np.random.seed(10) random.seed(10) - if cuda_state: - paddle.set_cuda_rng_state(cuda_state) - batch_size, input_size = 1, 10 model = Naive_fc_net( input_size, @@ -110,19 +107,27 @@ def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + if enable_autocast: + scaler = paddle.amp.GradScaler() + loss_ = [] param_ = [] grad_ = [] for step in range(10): + x_data = np.random.randn(batch_size, input_size).astype(np.float32) x = paddle.to_tensor(x_data) # x.stop_gradient = False - y_pred = model(x) - loss = y_pred.mean() - - loss_.append(np.asarray(loss).tolist()) - loss.backward() - optimizer.step() + with paddle.amp.auto_cast(True): + y_pred = model(x) + loss = y_pred.mean() + if enable_autocast: + scaler.scale(loss).backward() + scaler.minimize(optimizer, loss) + else: + loss_.append(np.asarray(loss).tolist()) + loss.backward() + optimizer.step() param_.append(np.asarray(model.parameters()[9]).tolist()) grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist()) @@ -138,25 +143,57 @@ def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): self.assertEqual(param_ref, param) self.assertEqual(grad_ref, grad) - cuda_state = paddle.get_cuda_rng_state() + # without recompute + loss_ref, param_ref, grad_ref = run_model(recompute_block=[]) + + # recompute 
second block + loss, param, grad = run_model(recompute_block=[1]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute fourth block + loss, param, grad = run_model(recompute_block=[3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second to fourth block + loss, param, grad = run_model(recompute_block=[1, 2, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second & fourth block + loss, param, grad = run_model(recompute_block=[1, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + def test_fc_net_without_restore_rng(self): + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[2], + recompute_kwargs={"preserve_rng_state": False}, + enable_autocast=True) + + def test_fc_net_with_amp(self): + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + # without recompute loss_ref, param_ref, grad_ref = run_model( - cuda_state, recompute_block=[]) + recompute_block=[], enable_autocast=True) # recompute second block - loss, param, grad = run_model(cuda_state, recompute_block=[1, 3]) + loss, param, grad = run_model(recompute_block=[1], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) # recompute fourth block - loss, param, grad = run_model(cuda_state, recompute_block=[3]) + loss, param, grad = run_model(recompute_block=[3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) # recompute second to fourth block - loss, param, grad = run_model(cuda_state, recompute_block=[1, 2, 3]) + loss, param, grad = run_model( + recompute_block=[1, 2, 3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) # recompute second & fourth block - loss, param, grad = run_model(cuda_state, recompute_block=[1, 3]) + loss, param, grad = run_model( + recompute_block=[1, 3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) def test_recompute_kwargs(self): @@ -164,12 +201,12 @@ def test_recompute_kwargs(self): kwargs = {"is_test": False} with self.assertRaises(ValueError): loss_ref, param_ref, grad_ref = run_model( - None, recompute_block=[2], recompute_kwargs=kwargs) + recompute_block=[2], recompute_kwargs=kwargs) def test_recompute_cpu_rng(self): paddle.set_device("cpu") with self.assertRaises(RuntimeError): - loss_ref, param_ref, grad_ref = run_model(None, recompute_block=[2]) + loss_ref, param_ref, grad_ref = run_model(recompute_block=[2]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 84fee8ace3ec42..1cf0c145f830de 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -333,7 +333,7 @@ def outputs_type_of_output(): hidden = fluid.layers.fc(input=[word, memory], size=10, act='tanh') - out = np.ones(1).astype('float32') + out = numpy.ones(1).astype('float32') drnn.update_memory(ex_mem=memory, new_mem=hidden) drnn.output(hidden, out) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py index 835f693ab6d7d8..1590d866b1c73c 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py +++ 
b/python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py @@ -145,7 +145,7 @@ def executor_main(self): def pe_main(self): image, label, loss = simple_fc_net() loss.persistable = False - persitables, non_persistables = get_persistables_and_non_persistables( + persistables, non_persistables = get_persistables_and_non_persistables( fluid.default_main_program(), [loss.name]) exe = fluid.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index 3bbc8df1882275..22126ce41d05cc 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -138,6 +138,7 @@ class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase): def test_api(self): def run_program(num_flatten_dims): paddle.seed(SEED) + np.random.seed(SEED) startup_program = Program() main_program = Program() @@ -158,6 +159,7 @@ def run_program(num_flatten_dims): exe = fluid.Executor(place=place) exe.run(startup_program) out = exe.run(main_program, feed={"x": input}, fetch_list=[out]) + return out res_1 = run_program(-1) res_2 = run_program(2) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index be5e87b9d344bb..af020548af376b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -530,7 +530,8 @@ def test_sharding_with_pp(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', - 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', + 'c_sync_calc_stream', 'c_gen_nccl_id', 'c_comm_init', + 'fill_constant', 'c_allreduce_sum', 'c_sync_calc_stream', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init' ]) diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index f258e830b5fe5f..14f5d4a41a1fed 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -133,7 +133,7 @@ def check_sparse_gradient_clip(self, place): print(val) self.assertFalse(np.isnan(val)) - def backward_and_optimize(cost): + def backward_and_optimize(self, cost): pass diff --git a/python/paddle/fluid/tests/unittests/test_memcpy_op.py b/python/paddle/fluid/tests/unittests/test_memcpy_op.py index a089b33b8ea632..38e9379bc16677 100755 --- a/python/paddle/fluid/tests/unittests/test_memcpy_op.py +++ b/python/paddle/fluid/tests/unittests/test_memcpy_op.py @@ -171,6 +171,14 @@ def test_OTHER_PLACE_NotImplementedError(self): fetch_list=[lod_tensor_var.name, pinned_var.name]) +class TestMemcpyApi(unittest.TestCase): + def test_api(self): + a = paddle.ones([1024, 1024]) + b = paddle.tensor.creation._memcpy(a, paddle.CUDAPinnedPlace()) + self.assertEqual(b.place.__repr__(), "CUDAPinnedPlace") + self.assertTrue(np.array_equal(a.numpy(), b.numpy())) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index ba4c1458c7791d..e79f6e5eb4a069 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -134,6 +134,64 @@ def test_check_output(self): self.check_output() +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestLarsMomentumOpWithMP(OpTest): + def setUp(self): + self.op_type = "lars_momentum" + + master_param = np.random.random((123, 321)).astype("float32") + param = master_param.astype("float16") + grad = np.random.random((123, 321)).astype("float16") + velocity = np.zeros((123, 321)).astype("float32") + learning_rate = np.array([0.001]).astype("float32") + mu = 0.0001 + lars_coeff = 0.001 + lars_weight_decay = 0.0005 + rescale_grad = 1.0 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate, + 'MasterParam': master_param, + } + + self.attrs = { + 'mu': mu, + 'lars_coeff': lars_coeff, + 'lars_weight_decay': lars_weight_decay, + 'multi_precision': True, + 'rescale_grad': rescale_grad + } + + fp32_grad = grad.astype("float32") + pnorm = np.sqrt(np.square(master_param).sum()) + gnorm = np.sqrt(np.square(fp32_grad).sum()) + local_lr = learning_rate * lars_coeff * pnorm / ( + gnorm + lars_weight_decay * pnorm) + fp32_grad = fp32_grad * rescale_grad + velocity_out = mu * velocity + local_lr * (fp32_grad + lars_weight_decay + * master_param) + p_new = master_param - velocity_out + param_out = p_new.astype("float16") + master_param_out = p_new + + self.outputs = { + 'ParamOut': param_out, + 'VelocityOut': velocity_out, + 'MasterParamOut': master_param_out + } + + def test_check_output(self): + paddle.enable_static() + if core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + class TestLarsMomentumOp(OpTest): def setUp(self): self.op_type = "lars_momentum" @@ -555,6 +613,77 @@ def test_momentum_static(self): exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) +class TestFusedMomentumWithDecayAPI(unittest.TestCase): + def get_program(self, weight_attr, bias_attr=False): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard( + main_program=main_program, startup_program=startup_program): + x = paddle.static.data(name='x', shape=[10, 10]) + linear = paddle.nn.Linear( + 10, 10, weight_attr=weight_attr, bias_attr=bias_attr) + out = linear(x) + loss = paddle.mean(out) + optimizer = paddle.optimizer.Momentum( + learning_rate=0.01, + momentum=0.9, + weight_decay=paddle.regularizer.L2Decay(0.5)) + optimizer.minimize(loss) + return main_program + + def test_param_has_l2decay(self): + paddle.enable_static() + weight_attr = paddle.ParamAttr( + name="weight", + initializer=paddle.nn.initializer.Constant(value=0.5), + regularizer=paddle.regularizer.L2Decay(0.1)) + program = self.get_program(weight_attr, bias_attr=False) + ops = program.global_block().ops + + self.assertEqual(ops[-1].attr('regularization_method'), 'l2_decay') + self.assertEqual(ops[-1].attr('regularization_coeff'), np.float32(0.1)) + for i in range(len(ops)): + self.assertTrue('sum' not in ops[i].type) + self.assertTrue('scale' not in ops[i].type) + + def test_param_has_l1decay(self): + paddle.enable_static() + weight_attr = paddle.ParamAttr( + name="weight", + initializer=paddle.nn.initializer.Constant(value=0.5), + regularizer=paddle.regularizer.L1Decay(0.1)) + bias_attr = paddle.ParamAttr( + name="bias", + initializer=paddle.nn.initializer.Constant(value=0.), + regularizer=None) + program = 
self.get_program(weight_attr, bias_attr) + ops = program.global_block().ops + + self.assertEqual(ops[-1].type, 'momentum') + self.assertEqual(ops[-2].type, 'momentum') + self.assertEqual(ops[-3].type, 'sum') + self.assertEqual(ops[-4].type, 'scale') + self.assertEqual(ops[-5].type, 'sign') + self.assertEqual(ops[-6].type, 'matmul_grad') + if 'weight' in ops[-1].input('Param'): + self.assertEqual(ops[-1].attr('regularization_method'), '') + self.assertEqual(ops[-1].attr('regularization_coeff'), 0) + if 'bias' in ops[-2].input('Param'): + self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay') + self.assertEqual(ops[-2].attr('regularization_coeff'), + np.float32(0.5)) + + def test_param_has_no_regularizer(self): + paddle.enable_static() + program = self.get_program(weight_attr=None) + ops = program.global_block().ops + self.assertEqual(ops[-1].attr('regularization_method'), 'l2_decay') + self.assertEqual(ops[-1].attr('regularization_coeff'), np.float32(0.5)) + for i in range(len(ops)): + self.assertTrue('sum' not in ops[i].type) + self.assertTrue('scale' not in ops[i].type) + + class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): def __update_params(self, momentum, linear): for i in range(10): diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 977882543a8886..4c69d003d80f8b 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -330,5 +330,19 @@ def test_main(self): self.run_main(num_workers) +class TestDataLoaderGenerateStates(unittest.TestCase): + def setUp(self): + self.inputs = [(0, 1), (0, 2), (1, 3)] + self.outputs = [[1835504127, 1731038949, 1320224556, 2330041505], + [2834126987, 2358157858, 1860244682, 1437227251], + [457190280, 2660306227, 859341110, 354512857]] + + def test_main(self): + from paddle.fluid.dataloader.worker import _generate_states + for inp, outp in zip(self.inputs, self.outputs): + out = _generate_states(*inp) + assert out == outp + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py index 95e2462a2e2989..c3b53e81a66659 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py @@ -25,7 +25,7 @@ class ReaderException(Exception): pass -class TestMultiprocessReaderException(unittest.TestCase): +class TestMultiprocessReaderExceptionWithQueueSuccess(unittest.TestCase): def setUp(self): self.use_pipe = False self.raise_exception = False @@ -36,7 +36,7 @@ def places(self): else: return [fluid.CPUPlace()] - def main_impl(self, place, iterable, use_legacy_py_reader): + def main_impl(self, place, iterable): sample_num = 40 batch_size = 4 @@ -53,37 +53,25 @@ def __impl__(): return __impl__ with fluid.program_guard(fluid.Program(), fluid.Program()): - if not use_legacy_py_reader: - image = fluid.data( - name='image', dtype='float32', shape=[None, 10]) - - reader = fluid.io.PyReader( - feed_list=[image], capacity=2, iterable=iterable) - else: - reader = fluid.layers.py_reader( - capacity=2, shapes=[[-1, 10], ], dtypes=['float32', ]) - image = fluid.layers.read_file(reader) + image = fluid.data(name='image', dtype='float32', shape=[None, 10]) + reader = 
fluid.io.DataLoader.from_generator( + feed_list=[image], capacity=2, iterable=iterable) image_p_1 = image + 1 decorated_reader = multiprocess_reader( [fake_reader(), fake_reader()], use_pipe=self.use_pipe) - if use_legacy_py_reader: - reader.decorate_paddle_reader( - fluid.io.batch( - decorated_reader, batch_size=batch_size)) + if isinstance(place, fluid.CUDAPlace): + reader.set_sample_generator( + decorated_reader, + batch_size=batch_size, + places=fluid.cuda_places(0)) else: - if isinstance(place, fluid.CUDAPlace): - reader.decorate_sample_generator( - decorated_reader, - batch_size=batch_size, - places=fluid.cuda_places(0)) - else: - reader.decorate_sample_generator( - decorated_reader, - batch_size=batch_size, - places=fluid.cpu_places()) + reader.set_sample_generator( + decorated_reader, + batch_size=batch_size, + places=fluid.cpu_places(1)) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -97,9 +85,9 @@ def __impl__(): for data in reader(): exe.run(feed=data, fetch_list=[image_p_1]) num += 1 - self.assertEquals(num, batch_num) + self.assertEqual(num, batch_num) except SystemError as ex: - self.assertEquals(num, 0) + self.assertEqual(num, 0) raise ReaderException() else: for _ in range(3): @@ -112,40 +100,40 @@ def __impl__(): except fluid.core.EOFException: reader.reset() self.assertFalse(self.raise_exception) - self.assertEquals(num, batch_num) + self.assertEqual(num, batch_num) except SystemError as ex: self.assertTrue(self.raise_exception) - self.assertEquals(num, 0) + self.assertEqual(num, 0) raise ReaderException() def test_main(self): for p in self.places(): for iterable in [False, True]: - use_legacy_py_reader_range = [False - ] if iterable else [False, True] - for use_legacy_py_reader in use_legacy_py_reader_range: - try: - with fluid.scope_guard(fluid.Scope()): - self.main_impl(p, iterable, use_legacy_py_reader) + try: + with fluid.scope_guard(fluid.Scope()): + self.main_impl(p, iterable) - self.assertTrue(not self.raise_exception) - except ReaderException: - self.assertTrue(self.raise_exception) + self.assertTrue(not self.raise_exception) + except ReaderException: + self.assertTrue(self.raise_exception) -class TestCase1(TestMultiprocessReaderException): +class TestMultiprocessReaderExceptionWithQueueFailed( + TestMultiprocessReaderExceptionWithQueueSuccess): def setUp(self): self.use_pipe = False self.raise_exception = True -class TestCase2(TestMultiprocessReaderException): +class TestMultiprocessReaderExceptionWithPipeSuccess( + TestMultiprocessReaderExceptionWithQueueSuccess): def setUp(self): self.use_pipe = True self.raise_exception = False -class TestCase3(TestMultiprocessReaderException): +class TestMultiprocessReaderExceptionWithPipeFailed( + TestMultiprocessReaderExceptionWithQueueSuccess): def setUp(self): self.use_pipe = True self.raise_exception = True diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 1673002cb79045..cb7e673c6ca29c 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -29,11 +29,10 @@ def setUp(self): self._python_interp = sys.executable if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': self._python_interp += " -m coverage run --branch -p" - self._python_interp += " check_nan_inf_base.py" self.env = os.environ.copy() - def test_nan_inf(self): + def check_nan_inf(self): cmd = self._python_interp proc = subprocess.Popen( @@ -53,6 +52,14 @@ def test_nan_inf(self): assert (out + err 
).find('There are `nan` or `inf` in tensor'.encode()) != -1 + def test_nan_inf_in_static_mode(self): + self._python_interp += " check_nan_inf_base.py" + self.check_nan_inf() + + def test_nan_inf_in_dynamic_mode(self): + self._python_interp += " check_nan_inf_base_dygraph.py" + self.check_nan_inf() + class TestNanInfEnv(TestNanInf): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_neg_op.py b/python/paddle/fluid/tests/unittests/test_neg_op.py new file mode 100644 index 00000000000000..e7b16bde023578 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_neg_op.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle + + +class TestNegOp(unittest.TestCase): + def setUp(self): + self.init_dtype_type() + self.input = (np.random.random((32, 8)) * 100).astype(self.dtype) + + def init_dtype_type(self): + self.dtype = np.float64 + + def run_imperative(self): + input = paddle.to_tensor(self.input) + dy_result = paddle.neg(input) + expected_result = np.negative(self.input) + self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) + + def run_static(self, use_gpu=False): + input = paddle.fluid.data(name='input', shape=[32, 8], dtype=self.dtype) + result = paddle.neg(input) + + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + st_result = exe.run(feed={"input": self.input}, fetch_list=[result]) + expected_result = np.negative(self.input) + self.assertTrue(np.allclose(st_result[0], expected_result)) + + def test_cpu(self): + paddle.disable_static(place=paddle.CPUPlace()) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + self.run_static() + + def test_gpu(self): + if not paddle.fluid.core.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_imperative() + paddle.enable_static() + + with paddle.static.program_guard(paddle.static.Program()): + self.run_static(use_gpu=True) + + +class TestNegOpFp32(TestNegOp): + def init_dtype_type(self): + self.dtype = np.float32 + + +class TestNegOpInt64(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int64 + + +class TestNegOpInt32(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int32 + + +class TestNegOpInt16(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int16 + + +class TestNegOpInt8(TestNegOp): + def init_dtype_type(self): + self.dtype = np.int8 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_quant_functional_layers.py b/python/paddle/fluid/tests/unittests/test_nn_quant_functional_layers.py new file mode 100644 index 00000000000000..86dc43bacf86be --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nn_quant_functional_layers.py @@ -0,0 +1,87 @@ +# Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle + + +class TestFunctionalLayers(unittest.TestCase): + """ + """ + + def setUp(self): + paddle.disable_static() + np.random.seed(1) + + shape = [3, 100, 120] + self.x = paddle.to_tensor(np.random.random(shape)) + self.y = paddle.to_tensor(np.random.random(shape)) + + def check(self, x, y): + self.assertTrue(np.allclose(x.numpy(), y.numpy())) + + def test_quant_add(self): + out_1 = paddle.add(self.x, self.y) + out_2 = paddle.nn.quant.add()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_subtract(self): + out_1 = paddle.subtract(self.x, self.y) + out_2 = paddle.nn.quant.subtract()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_multiply(self): + out_1 = paddle.multiply(self.x, self.y) + out_2 = paddle.nn.quant.multiply()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_divide(self): + out_1 = paddle.divide(self.x, self.y) + out_2 = paddle.nn.quant.divide()(self.x, self.y) + self.check(out_1, out_2) + + def test_quant_reshape(self): + reshape = [120, 300] + out_1 = paddle.reshape(self.x, reshape) + out_2 = paddle.nn.quant.reshape()(self.x.clone(), reshape) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + def test_quant_transpose(self): + perm = [1, 2, 0] + out_1 = paddle.transpose(self.x, perm) + out_2 = paddle.nn.quant.transpose()(self.x.clone(), perm) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + def test_quant_concat(self): + out_1 = paddle.concat([self.x, self.y], axis=0) + out_2 = paddle.nn.quant.concat()([self.x, self.y], 0) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + def test_quant_flatten(self): + start_axis = 1 + end_axis = 2 + out_1 = paddle.flatten(self.x, start_axis, end_axis) + out_2 = paddle.nn.quant.flatten()(self.x.clone(), start_axis, end_axis) + self.check(out_1, out_2) + self.assertTrue(out_1.shape == out_2.shape) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py index 79d36063d77d5b..0985ed33af376c 100644 --- a/python/paddle/fluid/tests/unittests/test_onnx_export.py +++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py @@ -47,7 +47,7 @@ def setUp(self): self.x_spec = paddle.static.InputSpec( shape=[None, 128], dtype='float32') - def test_with_tensor(): + def test_with_tensor(self): model = LinearNet() paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec]) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index be2a6a653cc6f4..594d0db035c6a5 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -19,6 +19,7 @@ import os import sys import six +from io import BytesIO import 
paddle import paddle.nn as nn @@ -760,6 +761,71 @@ def test_varbase_binary_var(self): self.assertTrue(np.array_equal(origin_array, load_tensor_array)) +class TestSaveLoadToMemory(unittest.TestCase): + def test_dygraph_save_to_memory(self): + paddle.disable_static() + linear = LinearNet() + state_dict = linear.state_dict() + byio = BytesIO() + paddle.save(state_dict, byio) + tensor = paddle.randn([2, 3], dtype='float32') + paddle.save(tensor, byio) + byio.seek(0) + # load state_dict + dict_load = paddle.load(byio, return_numpy=True) + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(v.numpy(), dict_load[k])) + # load tensor + tensor_load = paddle.load(byio, return_numpy=True) + self.assertTrue(np.array_equal(tensor_load, tensor.numpy())) + + with self.assertRaises(ValueError): + paddle.save(4, 3) + with self.assertRaises(ValueError): + paddle.save(state_dict, '') + with self.assertRaises(ValueError): + paddle.fluid.io._open_file_buffer('temp', 'b') + + def test_static_save_to_memory(self): + paddle.enable_static() + with new_program_scope(): + # create network + x = paddle.static.data( + name="x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 10, bias_attr=False) + z = paddle.static.nn.fc(z, 128, bias_attr=False) + loss = fluid.layers.reduce_mean(z) + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + state_dict = prog.state_dict() + keys = list(state_dict.keys()) + tensor = state_dict[keys[0]] + + byio = BytesIO() + byio2 = BytesIO() + paddle.save(prog, byio2) + paddle.save(tensor, byio) + paddle.save(state_dict, byio) + byio.seek(0) + byio2.seek(0) + + prog_load = paddle.load(byio2) + self.assertTrue(prog.desc.serialize_to_string() == + prog_load.desc.serialize_to_string()) + + tensor_load = paddle.load(byio, return_numpy=True) + self.assertTrue(np.array_equal(tensor_load, np.array(tensor))) + + state_dict_load = paddle.load(byio, return_numpy=True) + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(np.array(v), state_dict_load[k])) + + class TestSaveLoad(unittest.TestCase): def setUp(self): # enable dygraph mode diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py index 7385da56beab3e..0b9e038f7cd95e 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -16,6 +16,7 @@ import unittest import numpy as np +from io import BytesIO import os import sys import six @@ -176,13 +177,27 @@ def test_save_load_lod_tensor(self): paddle.save(temp_lod, path, use_binary_format=True) with self.assertRaises(RuntimeError): - fluid.core._save_lod_tensor( + fluid.core.save_lod_tensor( temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') with self.assertRaises(RuntimeError): - fluid.core._load_lod_tensor( + fluid.core.load_lod_tensor( temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') + # save to memory + byio = BytesIO() + paddle.save(tensor, byio, use_binary_format=True) + byio.seek(0) + # load from memory + loaded_tensor_mem = paddle.load(byio) + to_array_mem = np.array(loaded_tensor_mem) + self.assertTrue(np.array_equal(np.array(tensor), to_array_mem)) + + with self.assertRaises(NotImplementedError): + 
paddle.framework.io._save_lod_tensor(tensor, 1) + with self.assertRaises(NotImplementedError): + paddle.framework.io._load_lod_tensor(1) + def test_save_load_selected_rows(self): paddle.enable_static() place = fluid.CPUPlace() if not paddle.fluid.core.is_compiled_with_cuda( @@ -210,10 +225,28 @@ def test_save_load_selected_rows(self): np.array_equal(np.array(load_sr.get_tensor()), np_array)) with self.assertRaises(RuntimeError): - fluid.core._save_selected_rows( + fluid.core.save_selected_rows( selected_rows, 'test_paddle_save_load_selected_rows_not_exist_file/temp') with self.assertRaises(RuntimeError): - fluid.core._load_selected_rows( + fluid.core.load_selected_rows( selected_rows, 'test_paddle_save_load_selected_rows_not_exist_file/temp') + + # save to memory + byio = BytesIO() + paddle.save(selected_rows, byio, use_binary_format=True) + byio.seek(0) + # load from memory + selected_rows_mem = paddle.load(byio) + to_array_mem = np.array(selected_rows_mem) + self.assertTrue(isinstance(selected_rows_mem, fluid.core.SelectedRows)) + self.assertTrue(list(selected_rows_mem.rows()) == rows) + self.assertTrue(selected_rows_mem.height() == height) + self.assertTrue( + np.array_equal(np.array(selected_rows_mem.get_tensor()), np_array)) + + with self.assertRaises(NotImplementedError): + paddle.framework.io._save_selected_rows(selected_rows, 1) + with self.assertRaises(NotImplementedError): + paddle.framework.io._load_selected_rows(1) diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py index cd592416c1a512..1be10113a5591c 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline.py @@ -44,6 +44,15 @@ def test_dist_train(self): check_error_log=True, log_name=flag_name) + def test_dist_train_multi_device(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "pipeline_mnist_multi_device.py", + check_error_log=True, + delta=1e0, + log_name=flag_name) + def test_dist_train_one_device(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py index e058115d691993..a852b4c90421ac 100644 --- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py +++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py @@ -21,6 +21,11 @@ from paddle.autograd import PyLayer +class FakeTensor(paddle.fluid.core.VarBase): + def __init__(self): + pass + + class TestPyLayer(unittest.TestCase): def test_simple_pylayer_multiple_output(self): class tanh(PyLayer): @@ -426,6 +431,129 @@ def backward(ctx, dy): z = paddle.tanh(data) z = cus_tanh.apply(data) + def test_return_to_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + y1 = paddle.tanh(x1) + ctx.save_for_backward(y1) + tensor_1 = paddle.to_tensor([1, 2], dtype='float32') + return y1, 5, None, "helloworld", tensor_1 + + @staticmethod + def backward(ctx, dy1, dy2): + y1, = ctx.saved_tensor() + re1 = dy1 * (1 - paddle.square(y1)) + return dy1 + + input1 = paddle.randn([2, 3]).astype("float32") + input2 = input1.detach().clone() + input1.stop_gradient = False + input2.stop_gradient = False + z, number, none_item, string_item, tensor1 = Tanh.apply(x1=input1) + z.mean().backward() + + +class TestPyLayerReturnType(unittest.TestCase): + def test_forward_args_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def 
forward(ctx, x1): + y1 = FakeTensor() + return y1, x1 + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = FakeTensor() + + with self.assertRaises(ValueError): + y1, y2 = Tanh.apply(input1) + + def test_forward_kwargs_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + + return x1 + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = FakeTensor() + + with self.assertRaises(ValueError): + y = Tanh.apply(x1=input1) + + def test_forward_return_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + + return FakeTensor() + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = paddle.randn([3, 2]) + + with self.assertRaises(ValueError): + y = Tanh.apply(x1=input1) + + def test_forward_return_fake_tensor_tuple(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + + return FakeTensor(), FakeTensor() + + @staticmethod + def backward(ctx, dy1, dy2): + return dy1 + + input1 = paddle.randn([3, 2]) + + with self.assertRaises(ValueError): + y = Tanh.apply(x1=input1) + + def test_backward_return_fake_tensor_tuple(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1, x2): + return x1 + 1, x1 + 2 + + @staticmethod + def backward(ctx, dy1, dy2): + + return FakeTensor(), 2 + + input1 = paddle.randn([3, 2]) + input1.stop_gradient = False + y, _ = Tanh.apply(input1, 1 + input1) + + with self.assertRaises(ValueError): + y.mean().backward() + + def test_backward_return_fake_tensor(self): + class Tanh(PyLayer): + @staticmethod + def forward(ctx, x1): + return x1 + 1, x1 + 2 + + @staticmethod + def backward(ctx, dy1, dy2): + return FakeTensor() + + input1 = paddle.randn([3, 2]) + input1.stop_gradient = False + y, _ = Tanh.apply(input1) + + with self.assertRaises(ValueError): + y.mean().backward() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py new file mode 100644 index 00000000000000..34930e3577b9b5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_raw_program_optimizer.py @@ -0,0 +1,77 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
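The TestPyLayerReturnType cases above all smuggle in a FakeTensor (a VarBase whose __init__ never initializes the underlying variable) to check that PyLayer raises ValueError whenever forward inputs, forward outputs, or backward outputs are not genuine tensors. For contrast, a minimal well-formed PyLayer using only the API surface those tests already exercise (staticmethod forward/backward, ctx.save_for_backward, ctx.saved_tensor, apply); the Square class here is illustrative, not part of the diff:

    import paddle
    from paddle.autograd import PyLayer

    class Square(PyLayer):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(x)
            return x * x  # a real Tensor, so apply() accepts it

        @staticmethod
        def backward(ctx, dy):
            x, = ctx.saved_tensor()
            return 2.0 * x * dy  # one real Tensor grad per differentiable input

    x = paddle.randn([3, 2])
    x.stop_gradient = False
    y = Square.apply(x)
    y.mean().backward()  # invokes Square.backward under the hood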
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.distributed.fleet as fleet +import numpy as np +import os + + +class TestRawProgramOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "0" + os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001" + + def mlp(self, input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim, activation='tanh') + fc_2 = paddle.static.nn.fc(x=fc_1, size=hid_dim, activation='tanh') + prediction = paddle.static.nn.fc(x=[fc_2], + size=label_dim, + activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.mean(x=cost) + return avg_cost + + def gen_data(self): + return { + "x": np.random.random(size=(128, 32)).astype('float32'), + "y": np.random.randint( + 2, size=(128, 1)).astype('int64') + } + + def test_single_gpu(self): + paddle.enable_static() + fleet.init(is_collective=True) + sharding_program = paddle.static.Program() + sharding_startup_program = paddle.static.Program() + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + with fluid.program_guard(sharding_program, sharding_startup_program): + with fluid.unique_name.guard(): + input_x = paddle.static.data( + name="x", shape=[None, 32], dtype='float32') + input_y = paddle.static.data( + name="y", shape=[None, 1], dtype='int64') + cost = self.mlp(input_x=input_x, input_y=input_y) + output_name = cost.name + optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(), + strategy) + optimizer.minimize(cost) + + trainer_id = fleet.worker_index() + exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id)) + rank = fleet.worker_index() + exe.run(sharding_startup_program) + exe.run(program=sharding_program, feed=self.gen_data()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index edd69d67aaf4b6..08a70fe1852d02 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -59,6 +59,7 @@ def test_l2decay_regularizer(self): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) + optimizer = paddle.optimizer.Adam() params_grads = optimizer.append_regularization_ops(params_grads) self.assertEqual(len(params_grads), 1) self.assertEqual(len(block.ops), count_ops + 2) @@ -97,6 +98,7 @@ def test_l2decay_regularizer(self): params_grads = append_backward(mean_out) self.assertEqual(len(params_grads), 1) count_ops = len(block.ops) + optimizer = paddle.optimizer.Adam() params_grads = optimizer.append_regularization_ops(params_grads) self.assertEqual(len(params_grads), 1) self.assertEqual(len(block.ops), count_ops + 3) diff --git a/python/paddle/fluid/tests/unittests/test_roi_align_op.py b/python/paddle/fluid/tests/unittests/test_roi_align_op.py index 7d030855d114ee..7fab4017ab0ba1 100644 --- a/python/paddle/fluid/tests/unittests/test_roi_align_op.py +++ b/python/paddle/fluid/tests/unittests/test_roi_align_op.py @@ -129,8 +129,9 @@ def calc_roi_align(self): roi_width = roi_xmax - roi_xmin roi_height = roi_ymax - roi_ymin - roi_width = max(roi_width, 1) - roi_height = max(roi_height, 1) + if not self.aligned: + roi_width = max(roi_width, 1) + roi_height = max(roi_height, 1) bin_size_h = 
float(roi_height) / float(self.pooled_height) bin_size_w = float(roi_width) / float(self.pooled_width) @@ -138,7 +139,7 @@ def calc_roi_align(self): math.ceil(roi_height / self.pooled_height) roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \ math.ceil(roi_width / self.pooled_width) - count = int(roi_bin_grid_h * roi_bin_grid_w) + count = max(int(roi_bin_grid_h * roi_bin_grid_w), 1) pre_size = count * self.pooled_width * self.pooled_height bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin, int(roi_bin_grid_h), diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index f6332859f92f7a..81490642fa8c12 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -19,10 +19,13 @@ import numpy as np import six +import paddle import paddle.fluid as fluid from paddle import compat as cpt from paddle.fluid import core, framework, executor +paddle.enable_static() + @contextlib.contextmanager def program_scope_guard(): @@ -164,6 +167,8 @@ def create_var_base(is_input, name): persistable=True) inner_scope = core.Scope() outputs['OutScope'].value().set_scope(inner_scope) + + outputs['DOut'] = [create_var_base(False, "Fake_var")] return outputs def calc_dygraph_output(self, place): diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index bd784b65c10f00..b83478a5b8b0b0 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -22,6 +22,8 @@ import paddle.fluid.layers as layers import paddle +paddle.enable_static() + # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) @@ -683,6 +685,16 @@ def test_float_in_index(): self.assertRaises(Exception, test_float_in_index) +class TestInferShape(unittest.TestCase): + def test(self): + x = paddle.ones(shape=[3, 4, 5]) + x.desc.set_shape([3, -1, 5]) + self.assertEqual(x.shape, (3, -1, 5)) + + out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) + self.assertEqual(out0.shape, (3, 3, 5)) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestImperativeCUDAPinnedInput(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index 71550c8f24753c..ebf7c01e2cae5f 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -216,6 +216,71 @@ def initTestCase(self): self.infer_flags = [1, 1, 1, 1, 1] +class TestStrideSliceOpBool(TestStrideSliceOp): + def test_check_grad(self): + pass + + +class TestStrideSliceOpBool1D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(100).astype("bool") + self.axes = [0] + self.starts = [3] + self.ends = [8] + self.strides = [1] + self.infer_flags = [1] + + +class TestStrideSliceOpBool2D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(10, 10).astype("bool") + self.axes = [0, 1] + self.starts = [1, 0] + self.ends = [2, 2] + self.strides = [1, 1] + self.infer_flags = [1, 1] + + +class TestStrideSliceOpBool3D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 4, 10).astype("bool") + self.axes = [0, 1, 2] + self.starts = [0, -1, 0] + self.ends = [2, -3, 5] + 
self.strides = [1, -1, 1] + self.infer_flags = [1, 1, 1] + + +class TestStrideSliceOpBool4D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 3, 3, 4).astype("bool") + self.axes = [0, 1, 2, 3] + self.starts = [1, 0, 0, 0] + self.ends = [2, 2, 3, 4] + self.strides = [1, 1, 1, 2] + self.infer_flags = [1, 1, 1, 1] + + +class TestStrideSliceOpBool5D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 3, 3, 4, 5).astype("bool") + self.axes = [0, 1, 2, 3, 4] + self.starts = [1, 0, 0, 0, 0] + self.ends = [2, 2, 3, 4, 4] + self.strides = [1, 1, 1, 1, 1] + self.infer_flags = [1, 1, 1, 1] + + +class TestStrideSliceOpBool6D(TestStrideSliceOpBool): + def initTestCase(self): + self.input = np.random.rand(3, 3, 3, 6, 7, 8).astype("bool") + self.axes = [0, 1, 2, 3, 4, 5] + self.starts = [1, 0, 0, 0, 1, 2] + self.ends = [2, 2, 3, 1, 2, 8] + self.strides = [1, 1, 1, 1, 1, 2] + self.infer_flags = [1, 1, 1, 1, 1] + + class TestStridedSliceOp_starts_ListTensor(OpTest): def setUp(self): self.op_type = "strided_slice" diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py index 5f2dfbdd99e161..ba375f8b3c8a41 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py @@ -187,6 +187,13 @@ def test_tensor_div_scalar(self): c = paddle.full([2, 2, 2], 0.5, dtype="float32") self.check_operation(a, b, c, '/') + # tensor(float32) / scalar(int) + # this behavior should be equal to elementwise_div Op + a = paddle.to_tensor([99, 99, 99], dtype='float32') + b = 100 + c = a / paddle.to_tensor([100, 100, 100], dtype='float32') + self.check_operation(a, b, c, '/') + # tensor(int64) / scalar(float, .0) a = paddle.ones([2, 2, 2], dtype='int64') b = 2.0 diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py index d697666e12ddd1..aa24161687004b 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py @@ -218,6 +218,12 @@ def test_tensor_div_scalar(self): c = paddle.full([2, 2, 2], 0.5, dtype="float32") self.check_operation(a, b, c, '/') + # this behavior should be equal to elementwise_div Op + a = paddle.full([2, 2, 2], 99, dtype="float32") + b = 100 + c = a / paddle.full([2, 2, 2], 100, dtype="float32") + self.check_operation(a, b, c, '/') + # tensor(int64) / scalar(float, .0) with program_guard(Program()): a = paddle.ones([2, 2, 2], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py index cb5186468890d8..85d830485e23f1 100644 --- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py +++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py @@ -72,7 +72,7 @@ def test_trace_err(self): self.layer, 3) self.assertEqual( "The type of 'each element of inputs' in fluid.dygraph.jit.TracedLayer.trace must be fluid.Variable, but received <{} 'int'>.". 
-            format(self.type_str, self.type_str), str(e.exception))
+            format(self.type_str), str(e.exception))
         with self.assertRaises(TypeError) as e:
             dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace(
                 self.layer, [True, 1])
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index b3671327ca2959..b8d29d482fefa9 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -230,6 +230,14 @@ def _test_place(place):
             _test_place(core.CUDAPlace(0))
             _test_place("gpu:0")

+    def test_to_tensor_not_change_input_stop_gradient(self):
+        with paddle.fluid.dygraph.guard(core.CPUPlace()):
+            a = paddle.zeros([1024])
+            a.stop_gradient = False
+            b = paddle.to_tensor(a)
+            self.assertEqual(a.stop_gradient, False)
+            self.assertEqual(b.stop_gradient, True)
+
     def test_to_tensor_change_place(self):
         if core.is_compiled_with_cuda():
             a_np = np.random.rand(1024, 1024)
@@ -260,8 +268,9 @@ def test_to_tensor_with_lodtensor(self):
         with paddle.fluid.dygraph.guard(core.CUDAPlace(0)):
             lod_tensor = core.LoDTensor()
             lod_tensor.set(a_np, core.CUDAPlace(0))
-            a = paddle.to_tensor(lod_tensor)
+            a = paddle.to_tensor(lod_tensor, place=core.CPUPlace())
             self.assertTrue(np.array_equal(a_np, a.numpy()))
+            self.assertEqual(a.place.__repr__(), "CPUPlace")

     def test_to_variable(self):
         with fluid.dygraph.guard():
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index 71051689dbc157..c1956545f55ad1 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -164,12 +164,125 @@ def _test_slice(self, place):
         self.assertTrue(
             np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1]))

-    def test_slice(self):
-        place = fluid.CPUPlace()
-        self._test_slice(place)
+    def _test_slice_index_tensor(self, place):
+        data = np.random.rand(2, 3).astype("float32")
+        prog = paddle.static.Program()
+        with paddle.static.program_guard(prog):
+            x = paddle.assign(data)
+            idx0 = [1, 0]
+            idx1 = [0, 1]
+            idx2 = [0, 0]
+            idx3 = [1, 1]
+
+            out0 = x[paddle.assign(np.array(idx0))]
+            out1 = x[paddle.assign(np.array(idx1))]
+            out2 = x[paddle.assign(np.array(idx2))]
+            out3 = x[paddle.assign(np.array(idx3))]
+
+        exe = paddle.static.Executor(place)
+        result = exe.run(prog, fetch_list=[out0, out1, out2, out3])
+
+        expected = [data[idx0], data[idx1], data[idx2], data[idx3]]
+
+        self.assertTrue((result[0] == expected[0]).all())
+        self.assertTrue((result[1] == expected[1]).all())
+        self.assertTrue((result[2] == expected[2]).all())
+        self.assertTrue((result[3] == expected[3]).all())
+
+        with self.assertRaises(IndexError):
+            one = paddle.ones(shape=[1])
+            res = x[one, [0, 0]]
+
+    def _test_slice_index_list(self, place):
+        data = np.random.rand(2, 3).astype("float32")
+        prog = paddle.static.Program()
+        with paddle.static.program_guard(prog):
+            x = paddle.assign(data)
+            idx0 = [1, 0]
+            idx1 = [0, 1]
+            idx2 = [0, 0]
+            idx3 = [1, 1]
+
+            out0 = x[idx0]
+            out1 = x[idx1]
+            out2 = x[idx2]
+            out3 = x[idx3]
+
+        exe = paddle.static.Executor(place)
+        result = exe.run(prog, fetch_list=[out0, out1, out2, out3])
+
+        expected = [data[idx0], data[idx1],
data[idx2], data[idx3]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + + def _test_slice_index_ellipsis(self, place): + data = np.random.rand(2, 3, 4).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + out1 = x[0:, ..., 1:] + out2 = x[0:, ...] + out3 = x[..., 1:] + out4 = x[...] + + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=[out1, out2, out3, out4]) + + expected = [data[0:, ..., 1:], data[0:, ...], data[..., 1:], data[...]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + + with self.assertRaises(IndexError): + res = x[[1, 0], [0, 0]] + + with self.assertRaises(TypeError): + res = x[[1.2, 0]] + + def _test_slice_index_list_bool(self, place): + data = np.random.rand(2, 3).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + idx0 = [True, False] + idx1 = [False, True] + idx2 = [False, False] + idx3 = [True, True] + + out0 = x[idx0] + out1 = x[idx1] + out2 = x[idx2] + out3 = x[idx3] + + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=[out0, out1, out2, out3]) + + expected = [data[idx0], data[idx1], data[idx2], data[idx3]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + + with self.assertRaises(TypeError): + res = x[[True, 0]] + def test_slice(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): - self._test_slice(core.CUDAPlace(0)) + places.append(core.CUDAPlace(0)) + + for place in places: + self._test_slice(place) + self._test_slice_index_tensor(place) + self._test_slice_index_list(place) + self._test_slice_index_ellipsis(place) + self._test_slice_index_list_bool(place) def _tostring(self): b = default_main_program().current_block() @@ -232,5 +346,61 @@ def _test(): self.assertRaises(Exception, _test) +class TestVariableSlice(unittest.TestCase): + def _test_item_none(self, place): + data = np.random.rand(2, 3, 4).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + out0 = x[0:, None, 1:] + out1 = x[0:, None] + out2 = x[None, 1:] + out3 = x[None] + + outs = [out0, out1, out2, out3] + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=outs) + + expected = [ + data[0:, None, 1:], data[0:, None], data[None, 1:], data[None] + ] + for i in range(len(outs)): + self.assertEqual(outs[i].shape, expected[i].shape) + self.assertTrue((result[i] == expected[i]).all()) + + def _test_item_none_and_decrease(self, place): + data = np.random.rand(2, 3, 4).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + out0 = x[0, 1:, None] + out1 = x[0, None] + out2 = x[None, 1] + out3 = x[None] + out4 = x[0, 0, 0, None] + out5 = x[None, 0, 0, 0, None] + + outs = [out0, out1, out2, out3, out4, out5] + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=outs) + expected = [ + data[0, 1:, None], data[0, None], data[None, 1], data[None], + data[0, 0, 0, None], 
data[None, 0, 0, 0, None] + ] + + for i in range(len(outs)): + self.assertEqual(outs[i].shape, expected[i].shape) + self.assertTrue((result[i] == expected[i]).all()) + + def test_slice(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._test_item_none(place) + self._test_item_none_and_decrease(place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 24c463ebfc9a13..5793f0148fc547 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -35,10 +35,16 @@ def YoloBox(x, img_size, attrs): downsample = attrs['downsample'] clip_bbox = attrs['clip_bbox'] scale_x_y = attrs['scale_x_y'] + iou_aware = attrs['iou_aware'] + iou_aware_factor = attrs['iou_aware_factor'] bias_x_y = -0.5 * (scale_x_y - 1.) input_h = downsample * h input_w = downsample * w + if iou_aware: + ioup = x[:, :an_num, :, :] + ioup = np.expand_dims(ioup, axis=-1) + x = x[:, an_num:, :, :] x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) pred_box = x[:, :, :, :, :4].copy() @@ -57,7 +63,11 @@ def YoloBox(x, img_size, attrs): pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h - pred_conf = sigmoid(x[:, :, :, :, 4:5]) + if iou_aware: + pred_conf = sigmoid(x[:, :, :, :, 4:5])**( + 1 - iou_aware_factor) * sigmoid(ioup)**iou_aware_factor + else: + pred_conf = sigmoid(x[:, :, :, :, 4:5]) pred_conf[pred_conf < conf_thresh] = 0. pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf pred_box = pred_box * (pred_conf > 0.).astype('float32') @@ -97,6 +107,8 @@ def setUp(self): "downsample": self.downsample, "clip_bbox": self.clip_bbox, "scale_x_y": self.scale_x_y, + "iou_aware": self.iou_aware, + "iou_aware_factor": self.iou_aware_factor } self.inputs = { @@ -123,6 +135,8 @@ def initTestCase(self): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1. + self.iou_aware = False + self.iou_aware_factor = 0.5 class TestYoloBoxOpNoClipBbox(TestYoloBoxOp): @@ -137,6 +151,8 @@ def initTestCase(self): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1. + self.iou_aware = False + self.iou_aware_factor = 0.5 class TestYoloBoxOpScaleXY(TestYoloBoxOp): @@ -151,19 +167,36 @@ def initTestCase(self): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1.2 + self.iou_aware = False + self.iou_aware_factor = 0.5 + + +class TestYoloBoxOpIoUAware(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int(len(self.anchors) // 2) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = True + self.x_shape = (self.batch_size, an_num * (6 + self.class_num), 13, 13) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1. 
+ self.iou_aware = True + self.iou_aware_factor = 0.5 class TestYoloBoxDygraph(unittest.TestCase): def test_dygraph(self): paddle.disable_static() - x = np.random.random([2, 14, 8, 8]).astype('float32') img_size = np.ones((2, 2)).astype('int32') - - x = paddle.to_tensor(x) img_size = paddle.to_tensor(img_size) + x1 = np.random.random([2, 14, 8, 8]).astype('float32') + x1 = paddle.to_tensor(x1) boxes, scores = paddle.vision.ops.yolo_box( - x, + x1, img_size=img_size, anchors=[10, 13, 16, 30], class_num=2, @@ -172,16 +205,30 @@ def test_dygraph(self): clip_bbox=True, scale_x_y=1.) assert boxes is not None and scores is not None + + x2 = np.random.random([2, 16, 8, 8]).astype('float32') + x2 = paddle.to_tensor(x2) + boxes, scores = paddle.vision.ops.yolo_box( + x2, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1., + iou_aware=True, + iou_aware_factor=0.5) paddle.enable_static() class TestYoloBoxStatic(unittest.TestCase): def test_static(self): - x = paddle.static.data('x', [2, 14, 8, 8], 'float32') + x1 = paddle.static.data('x1', [2, 14, 8, 8], 'float32') img_size = paddle.static.data('img_size', [2, 2], 'int32') boxes, scores = paddle.vision.ops.yolo_box( - x, + x1, img_size=img_size, anchors=[10, 13, 16, 30], class_num=2, @@ -191,6 +238,20 @@ def test_static(self): scale_x_y=1.) assert boxes is not None and scores is not None + x2 = paddle.static.data('x2', [2, 16, 8, 8], 'float32') + boxes, scores = paddle.vision.ops.yolo_box( + x2, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1., + iou_aware=True, + iou_aware_factor=0.5) + assert boxes is not None and scores is not None + class TestYoloBoxOpHW(TestYoloBoxOp): def initTestCase(self): @@ -204,6 +265,8 @@ def initTestCase(self): self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 9) self.imgsize_shape = (self.batch_size, 2) self.scale_x_y = 1. 
+ self.iou_aware = False + self.iou_aware_factor = 0.5 if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py index bebb5c76264914..53a91af3a716ba 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_pool2d_op_xpu.py @@ -25,6 +25,7 @@ import paddle.fluid as fluid from paddle.fluid import Program, program_guard import paddle +from test_pool2d_op import adaptive_start_index, adaptive_end_index paddle.enable_static() diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 989db9efea119d..92a900e6c37158 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -17,7 +17,7 @@ import os __all__ = [ 'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer', - 'HeterXpuTrainer', 'HeterBoxWorker' + 'HeterXpuTrainer', 'HeterBoxTrainer' ] diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py index 00dea8d1251f4b..95379a34c22144 100644 --- a/python/paddle/fluid/trainer_factory.py +++ b/python/paddle/fluid/trainer_factory.py @@ -27,7 +27,7 @@ from .framework import Variable from multiprocessing import Process, Manager -__all__ = ["TrainerFactory", "FetchHandler", "FetchHandlerMonitor"] +__all__ = ["TrainerFactory", "FetchHandlerMonitor"] class TrainerFactory(object): diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index ef6975c3d241e5..308a876977cf4f 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -434,9 +434,10 @@ def _transpile_startup_program(self): print("total endpoints: ", self.endpoints) print("rank: %d, ring_id: %d" % (self.rank, self.nrings)) for ring_id in range(self.nrings): - self._init_communicator( - self.startup_program, self.current_endpoint, self.endpoints, - self.rank, ring_id, self.wait_port, True) + self._init_communicator(self.startup_program, + self.current_endpoint, self.endpoints, + self.rank, ring_id, self.wait_port) + else: print("begin to _transpile_startup_program for single-node") block = self.startup_program.global_block() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 242b5b14db2bcc..c9363dff13d81c 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -50,6 +50,17 @@ def replace_ellipsis(var, item): return item +def replace_none(item): + new_item = [] + none_axes = [] + for i, slice_item in enumerate(item): + if slice_item is None: + none_axes.append(i) + else: + new_item.append(slice_item) + return new_item, none_axes + + def is_integer_or_scalar_tensor(ele): from .framework import Variable if isinstance(ele, int): @@ -87,7 +98,7 @@ def _getitem_impl_(var, item): Returns: Sliced variable """ - from .framework import default_main_program + from .framework import default_main_program, Variable if not isinstance(item, tuple): item = (item, ) @@ -97,12 +108,27 @@ def _getitem_impl_(var, item): starts = [] ends = [] steps = [] - reverse_axis = [] + reverse_axes = [] use_strided_slice = False + item, none_axes = replace_none(item) + item = replace_ellipsis(var, item) for dim, slice_item in enumerate(item): if is_integer_or_scalar_tensor(slice_item): + if isinstance(slice_item, + int) and var.shape[dim] is not None and var.shape[ + dim] >= 0 and slice_item >= 
var.shape[dim]: + # For Python, if users write a, b = var, the __getitem__ + # method will iterate through 0, 1, 2 ... until __getitem__ + # throws an IndexError, then stop. The var[0], var[1] will + # be given to a, b respectively. If more values are given, + # the unpack size would cause an error. + # + # We raise an IndexError here to support syntax like `a, b = var` + raise IndexError( + "slice_item %d at dim %d should be >= 0 and < var.shape[%d]: %d" + % (slice_item, dim, dim, var.shape[dim])) decrease_axes.append(dim) start = slice_item step = 1 @@ -120,12 +146,54 @@ if start is None and end is None: assert (step == -1) - reverse_axis.append(dim) + reverse_axes.append(dim) continue start = 0 if start is None else start end = MAX_INTEGER if end is None else end + elif isinstance(slice_item, list): + is_bool_list = False + for i in slice_item: + if not isinstance(i, (int, bool)): + raise TypeError("Only support int or bool in index list.") + + if isinstance(i, bool): + is_bool_list = True + break + + if len(item) != 1: + raise IndexError( + "When index contains a list, its length must be 1, but received {}". + format(len(item))) + + if is_bool_list: + new_slice_item = [] + for idx, ele in enumerate(slice_item): + if not isinstance(ele, bool): + raise TypeError( + "Mixed bool index with other types is not supported." + ) + + if ele is True: + new_slice_item.append(idx) + slice_item = new_slice_item + + from .layers import assign + from ..tensor import index_select + + idx = assign(np.array(slice_item).astype("int32")) + return index_select(var, index=idx, axis=0) + + elif isinstance(slice_item, Variable): + if len(item) != 1: + raise IndexError( + "When index contains a Tensor, its length must be 1, but received {}". + format(len(item))) + + from ..tensor import index_select + return index_select(var, index=slice_item, axis=0) + else: raise IndexError( "Valid index accept int or slice or ellipsis, but received {}.".
+ # For example: + # # x.shape: (2,3,4) + # out = x[0, 1, 1, None] # out.shape : (1) + + from ..tensor import unsqueeze + out = unsqueeze(out, axis=none_axes) return out diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 93056a60c371c3..e9d690c28d60ec 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -87,8 +87,6 @@ def get_default_dtype(): @contextmanager def set_grad_enabled(mode): """ - :api_attr: imperative - Create a context which enables or disables dygraph gradient calculation. Args: @@ -96,6 +94,7 @@ def set_grad_enabled(mode): Examples: .. code-block:: python + import paddle x = paddle.ones([3, 2]) x.stop_gradient = False diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 1705db50d391a9..5f1ffa81eab17b 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -32,6 +32,7 @@ from paddle.fluid import core from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict, _pickle_loads_mac from paddle.fluid.io import _legacy_save as _legacy_static_save +from paddle.fluid.io import _open_file_buffer, _is_file_path, _is_memory_buffer from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, in_dygraph_mode, ParamBase, _current_expected_place, Program from paddle.fluid.dygraph.jit import _SaveLoadConfig @@ -450,30 +451,81 @@ def ndarray_to_tensor(obj): def _save_lod_tensor(tensor, file_name): if not tensor._is_initialized(): raise ValueError("The saved tensor is not initialized.") - _seek = core._save_lod_tensor(tensor, file_name) - # '_seek' is the end position of this tensor in the file. + if _is_file_path(file_name): + _seek = core.save_lod_tensor(tensor, file_name) + # '_seek' is the end position of this tensor in the file. + + elif _is_memory_buffer(file_name): + tensor_bytes = core.save_lod_tensor_to_memory(tensor) + + with _open_file_buffer(file_name, 'wb') as f: + f.write(tensor_bytes) + _seek = f.tell() + + else: + raise NotImplementedError( + 'Only supports saving objects to file or BytesIO, but received {}'. + format(type(file_name))) return _seek def _load_lod_tensor(file_name): temp_t = paddle.fluid.core.LoDTensor() - # '_seek' is the end position of this tensor in the file. - _seek = paddle.fluid.core._load_lod_tensor(temp_t, file_name) + if _is_file_path(file_name): + # '_seek' is the end position of this tensor in the file. + _seek = paddle.fluid.core.load_lod_tensor(temp_t, file_name) + + elif _is_memory_buffer(file_name): + with _open_file_buffer(file_name, 'rb') as f: + tensor_bytes = f.read() + paddle.fluid.core.load_lod_tensor_from_memory(temp_t, tensor_bytes) + _seek = f.tell() + + else: + raise NotImplementedError( + 'Only supports load objects from file or BytesIO, but received {}'. + format(type(file_name))) + return temp_t, _seek def _save_selected_rows(selected_rows, file_name): - # '_seek' is the end position of this SelectedRows in the file. if not selected_rows.get_tensor()._is_initialized(): raise ValueError("The saved tensor is not initialized.") - _seek = core._save_selected_rows(selected_rows, file_name) + if _is_file_path(file_name): + # '_seek' is the end position of this SelectedRows in the file. 
+ _seek = core.save_selected_rows(selected_rows, file_name) + + elif _is_memory_buffer(file_name): + selected_rows_bytes = core.save_selected_rows_to_memory(selected_rows) + with _open_file_buffer(file_name, 'wb') as f: + f.write(selected_rows_bytes) + _seek = f.tell() + else: + raise NotImplementedError( + 'Only supports saving objects to file or BytesIO, but received {}'. + format(type(file_name))) return _seek def _load_selected_rows(file_name): temp_sr = core.SelectedRows() - # '_seek' is the end position of this SelectedRows in the file. - _seek = core._load_selected_rows(temp_sr, file_name) + if _is_file_path(file_name): + # '_seek' is the end position of this SelectedRows in the file. + _seek = core.load_selected_rows(temp_sr, file_name) + + elif _is_memory_buffer(file_name): + with _open_file_buffer(file_name, 'rb') as f: + selected_rows_bytes = f.read() + paddle.fluid.core.load_selected_rows_from_memory( + temp_sr, selected_rows_bytes) + _seek = f.tell() + + else: + raise NotImplementedError( + 'Only supports load objects from file or BytesIO, but received {}'. + format(type(file_name))) + return temp_sr, _seek @@ -509,7 +561,7 @@ def save(obj, path, protocol=4, **configs): Args: obj(Object) : The object to be saved. - path(str) : The path of the object to be saved. + path(str|BytesIO) : The path/buffer of the object to be saved. If saved in the current directory, the input path string will be used as the file name. protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 4 @@ -593,18 +645,39 @@ def save(obj, path, protocol=4, **configs): main_program = paddle.static.default_main_program() path = "example/main_program.pdmodel" paddle.save(main_program, path) - ''' - # 1. input check - filename = os.path.basename(path) - if filename == "": - raise ValueError("The input path MUST be format of dirname/filename " - "[dirname\\filename in Windows system], but received " - "filename is empty string.") - # 2. save object - dirname = os.path.dirname(path) - if dirname and not os.path.exists(dirname): - os.makedirs(dirname) + + # example 5: save object to memory + from io import BytesIO + import paddle + from paddle.nn import Linear + paddle.disable_static() + + linear = Linear(5, 10) + state_dict = linear.state_dict() + byio = BytesIO() + paddle.save(state_dict, byio) + tensor = paddle.randn([2, 3], dtype='float32') + paddle.save(tensor, byio) + + ''' + if _is_file_path(path): + # 1. input check + filename = os.path.basename(path) + if filename == "": + raise ValueError( + "The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + + # 2. save object + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) + elif not _is_memory_buffer(path): + raise ValueError( + "only supports saving objects to file and `BytesIO`, but got {}". 
+ format(type(path))) config = _parse_save_config(configs) @@ -625,7 +698,7 @@ def save(obj, path, protocol=4, **configs): if isinstance(obj, Program): obj.desc.flush() - with open(path, "wb") as f: + with _open_file_buffer(path, "wb") as f: f.write(obj.desc.serialize_to_string()) elif _is_state_dict(obj): @@ -634,7 +707,7 @@ def save(obj, path, protocol=4, **configs): else: _legacy_static_save(obj, path, protocol) else: - with open(path, 'wb') as f: + with _open_file_buffer(path, 'wb') as f: _pickle_save(obj, f, protocol) @@ -648,12 +721,6 @@ def _legacy_save(obj, path, protocol=2): if len(obj) == 0: warnings.warn("The input state dict is empty, no need to save.") - filename = os.path.basename(path) - if filename == "": - raise ValueError("The input path MUST be format of dirname/filename " - "[dirname\\filename in Windows system], but received " - "filename is empty string.") - if not isinstance(protocol, int): raise ValueError("The 'protocol' MUST be `int`, but received {}".format( type(protocol))) @@ -662,26 +729,33 @@ def _legacy_save(obj, path, protocol=2): raise ValueError("Expected 1<'protocol'<5, but received protocol={}". format(protocol)) - # 2. save object - dirname = os.path.dirname(path) - if dirname and not os.path.exists(dirname): - os.makedirs(dirname) + if _is_file_path(path): + filename = os.path.basename(path) + if filename == "": + raise ValueError( + "The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + # 2. save object + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) - # TODO(chenweihang): supports save other object if isinstance(obj, dict): saved_obj = _build_saved_state_dict(obj) saved_obj = _unpack_saved_dict(saved_obj, protocol) # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if sys.platform == 'darwin' and sys.version_info.major == 3: + if _is_file_path( + path) and sys.platform == 'darwin' and sys.version_info.major == 3: pickle_bytes = pickle.dumps(saved_obj, protocol=protocol) with open(path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: - with open(path, 'wb') as f: + with _open_file_buffer(path, 'wb') as f: pickle.dump(saved_obj, f, protocol=protocol) @@ -716,7 +790,7 @@ def load(path, **configs): ``Layer.set_state_dict`` later. Args: - path(str) : The path to load the target object. Generally, the path is the target + path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target file path. When loading state_dict from the saved result of the API used to save the inference model, the path may be a file prefix or directory. **configs (dict, optional): other load configuration options for compatibility. 
We do not @@ -822,18 +896,36 @@ def load(path, **configs): print(load_main) + # example 5: save object to memory + from io import BytesIO + import paddle + from paddle.nn import Linear + paddle.disable_static() + + linear = Linear(5, 10) + state_dict = linear.state_dict() + byio = BytesIO() + paddle.save(state_dict, byio) + tensor = paddle.randn([2, 3], dtype='float32') + paddle.save(tensor, byio) + byio.seek(0) + # load state_dict + dict_load = paddle.load(byio) + ''' - if os.path.isfile(path): + if _is_memory_buffer(path) or os.path.isfile(path): config = _parse_load_config(configs) if six.PY2: exception_type = KeyError else: exception_type = pickle.UnpicklingError try: - with open(path, 'rb') as f: + with _open_file_buffer(path, 'rb') as f: # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if sys.platform == 'darwin' and sys.version_info.major == 3: + if _is_file_path( + path + ) and sys.platform == 'darwin' and sys.version_info.major == 3: load_result = _pickle_loads_mac(path, f) else: load_result = pickle.load(f) if six.PY2 else pickle.load( @@ -875,7 +967,7 @@ def load(path, **configs): return tensor except: try: - with open(path, "rb") as f: + with _open_file_buffer(path, "rb") as f: program_desc_str = f.read() program = Program.parse_from_string( program_desc_str) @@ -895,9 +987,9 @@ def _legacy_load(path, **configs): load_result = None config = _parse_load_config(configs) - if os.path.isfile(path): + if os.path.isfile(path) or _is_memory_buffer(path): # we think path is file means this file is created by paddle.save - with open(path, 'rb') as f: + with _open_file_buffer(path, 'rb') as f: load_result = pickle.load(f) if six.PY2 else pickle.load( f, encoding='latin1') load_result = _pack_loaded_dict(load_result) diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py index 243bd79c191dd6..b491bc0271bec7 100644 --- a/python/paddle/hapi/hub.py +++ b/python/paddle/hapi/hub.py @@ -110,7 +110,11 @@ def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'): url = _git_archive_link(repo_owner, repo_name, branch, source=source) fpath = get_path_from_url( - url, hub_dir, check_exist=not force_reload, decompress=False) + url, + hub_dir, + check_exist=not force_reload, + decompress=False, + method=('wget' if source == 'gitee' else 'get')) shutil.move(fpath, cached_file) with zipfile.ZipFile(cached_file) as cached_zipfile: diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 160d6c54759d90..e53ab12f841806 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -163,7 +163,7 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, }) elif core.is_compiled_with_npu(): hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), + name=fluid.unique_name.generate('hccl_id'), persistable=True, type=core.VarDesc.VarType.RAW) endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} @@ -710,10 +710,10 @@ def train_batch(self, inputs, labels=None): enable=self._amp_level != 'O0', **self._amp_custom_lists): if self._nranks > 1: outputs = self.ddp_model.forward( - * [to_variable(x) for x in inputs]) + *[to_variable(x) for x in inputs]) else: outputs = self.model.network.forward( - * [to_variable(x) for x in inputs]) + *[to_variable(x) for x in inputs]) losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -732,7 +732,7 @@ def train_batch(self, inputs, labels=None): metrics = [] for metric in self.model._metrics: metric_outs = 
metric.compute(*(to_list(outputs) + labels)) - m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) + m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) return ([to_numpy(l) for l in losses], metrics) \ @@ -746,7 +746,7 @@ def eval_batch(self, inputs, labels=None): labels = labels or [] labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.network.forward(* [to_variable(x) for x in inputs]) + outputs = self.model.network.forward(*[to_variable(x) for x in inputs]) if self.model._loss: losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -777,7 +777,7 @@ def eval_batch(self, inputs, labels=None): self._merge_count[self.mode + '_batch'] = samples metric_outs = metric.compute(*(to_list(outputs) + labels)) - m = metric.update(* [to_numpy(m) for m in to_list(metric_outs)]) + m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) if self.model._loss and len(metrics): @@ -1831,6 +1831,7 @@ def predict(self, batch_size=1, num_workers=0, stack_outputs=False, + verbose=1, callbacks=None): """ Compute the output predictions on testing data. @@ -1851,7 +1852,10 @@ def predict(self, be a length N list in shape [[X, Y], [X, Y], ....[X, Y]] if stack_outputs is False. stack_outputs as False is used for LoDTensor output situation, it is recommended set as True if outputs contains no LoDTensor. Default: False. + verbose (int): The verbosity mode, should be 0, 1, or 2. 0 = silent, + 1 = progress bar, 2 = one line per batch. Default: 1. callbacks(Callback): A Callback instance, default None. + Returns: list: output of models. @@ -1911,7 +1915,7 @@ def __len__(self): self._test_dataloader = test_loader - cbks = config_callbacks(callbacks, model=self, verbose=1) + cbks = config_callbacks(callbacks, model=self, verbose=verbose) test_steps = self._len_data_loader(test_loader) logs = {'steps': test_steps} diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index d78196d94451ed..93f1a5a37a67f1 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -80,6 +80,23 @@ def forward(self, inputs): params_info = paddle.summary(lenet, (1, 1, 28, 28)) print(params_info) + # multi input demo + class LeNetMultiInput(LeNet): + + def forward(self, inputs, y): + x = self.features(inputs) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.fc(x + y) + return x + + lenet_multi_input = LeNetMultiInput() + + params_info = paddle.summary(lenet_multi_input, [(1, 1, 28, 28), (1, 400)], + ['float32', 'float32']) + print(params_info) + """ if isinstance(input_size, InputSpec): _input_size = tuple(input_size.shape) diff --git a/python/paddle/hapi/progressbar.py b/python/paddle/hapi/progressbar.py index 5f63a3169f8ac7..6ed33f4f960b40 100644 --- a/python/paddle/hapi/progressbar.py +++ b/python/paddle/hapi/progressbar.py @@ -33,7 +33,8 @@ def __init__(self, width=30, verbose=1, start=True, - file=sys.stdout): + file=sys.stdout, + name='step'): self._num = num if isinstance(num, int) and num <= 0: raise TypeError('num should be None or integer (> 0)') @@ -47,6 +48,7 @@ def __init__(self, if start: self._start = time.time() self._last_update = 0 + self.name = name self._dynamic_display = ( (hasattr(self.file, 'isatty') and @@ -74,7 +76,7 @@ def start(self): self.file.flush() self._start = time.time() - def update(self, current_num, values=None): + def update(self, current_num, values={}): now = time.time() if current_num: @@ -83,11 +85,11 
@@ def update(self, current_num, values=None): time_per_unit = 0 if time_per_unit >= 1 or time_per_unit == 0: - fps = ' - %.0fs/%s' % (time_per_unit, 'step') + fps = ' - %.0fs/%s' % (time_per_unit, self.name) elif time_per_unit >= 1e-3: - fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step') + fps = ' - %.0fms/%s' % (time_per_unit * 1e3, self.name) else: - fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step') + fps = ' - %.0fus/%s' % (time_per_unit * 1e6, self.name) info = '' if self._verbose == 1: @@ -102,7 +104,7 @@ def update(self, current_num, values=None): if self._num is not None: numdigits = int(np.log10(self._num)) + 1 - bar_chars = ('step %' + str(numdigits) + 'd/%d [') % ( + bar_chars = (self.name + ' %' + str(numdigits) + 'd/%d [') % ( current_num, self._num) prog = float(current_num) / self._num prog_width = int(self._width * prog) @@ -116,7 +118,7 @@ def update(self, current_num, values=None): bar_chars += ('.' * (self._width - prog_width)) bar_chars += ']' else: - bar_chars = 'step %3d' % current_num + bar_chars = self.name + ' %3d' % current_num self._total_width = len(bar_chars) sys.stdout.write(bar_chars) @@ -162,10 +164,10 @@ def update(self, current_num, values=None): elif self._verbose == 2 or self._verbose == 3: if self._num: numdigits = int(np.log10(self._num)) + 1 - count = ('step %' + str(numdigits) + 'd/%d') % (current_num, - self._num) + count = (self.name + ' %' + str(numdigits) + 'd/%d') % ( + current_num, self._num) else: - count = 'step %3d' % current_num + count = self.name + ' %3d' % current_num info = count + info for k, val in values: diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 7cf3f94872de17..da31cc0239f88f 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -138,6 +138,7 @@ from . import utils # noqa: F401 from . import functional # noqa: F401 from . import initializer # noqa: F401 +from . import quant # noqa: F401 #TODO: remove 'diag_embed', 'remove_weight_norm', 'weight_norm' months later. import paddle.utils.deprecated as deprecated diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index e7e36ca7a3a1a7..57ce6c78e958f8 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1446,7 +1446,9 @@ def linear(x, weight, bias=None, name=None): # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ if in_dygraph_mode(): - pre_bias = core.ops.matmul_v2(x, weight) + pre_bias = _varbase_creator(dtype=x.dtype) + core.ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) if bias is None: return pre_bias diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 67958b8683fe17..66913f3ad2f659 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
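As a quick orientation before the convolution hunks: the functional/common.py change above only swaps the underlying dygraph kernel of paddle.nn.functional.linear from matmul_v2 to a core.ops.matmul call with explicit transpose/alpha attributes; the public API is untouched. A minimal sketch of that unaffected entry point (shapes chosen arbitrarily):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([3, 4], dtype='float32')   # 3 samples, 4 features
    w = paddle.randn([4, 5], dtype='float32')   # in_features x out_features
    b = paddle.zeros([5], dtype='float32')
    y = F.linear(x, w, b)                       # y = x @ w + b
    print(y.shape)                              # [3, 5]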
from __future__ import print_function +from paddle.fluid.framework import _global_flags import numpy as np from ...device import get_cudnn_version @@ -537,7 +538,7 @@ def conv2d(x, use_cudnn = True if (core.is_compiled_with_cuda() and cudnn_version is not None) else False - use_mkldnn = core.globals()["FLAGS_use_mkldnn"] + use_mkldnn = _global_flags()["FLAGS_use_mkldnn"] # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index eecea3034a752e..fc98157273447f 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -98,7 +98,7 @@ def __init__(self, 'kernel_size') self._padding = padding self._padding_mode = padding_mode - self.output_padding = output_padding + self._output_padding = output_padding if dims != 1: self._updated_padding, self._padding_algorithm = _update_padding_nd( padding, channel_last, dims) @@ -163,7 +163,7 @@ def extra_repr(self): main_str += ', padding={_padding}' if self._padding_mode is not 'zeros': main_str += ', padding_mode={_padding_mode}' - if self.output_padding != 0: + if self._output_padding != 0: main_str += ', output_padding={_output_padding}' if self._dilation != [1] * len(self._dilation): main_str += ', dilation={_dilation}' @@ -508,7 +508,7 @@ def forward(self, x, output_size=None): self.weight, bias=self.bias, output_size=output_size, - output_padding=self.output_padding, + output_padding=self._output_padding, padding=self._padding, stride=self._stride, dilation=self._dilation, @@ -824,7 +824,7 @@ def __init__(self, def forward(self, x, output_size=None): if output_size is None: - output_padding = self.output_padding + output_padding = self._output_padding else: output_padding = 0 @@ -1161,7 +1161,7 @@ def __init__(self, def forward(self, x, output_size=None): if output_size is None: - output_padding = self.output_padding + output_padding = self._output_padding else: output_padding = 0 diff --git a/python/paddle/nn/quant/__init__.py b/python/paddle/nn/quant/__init__.py new file mode 100644 index 00000000000000..c7f9a5073def83 --- /dev/null +++ b/python/paddle/nn/quant/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
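The layer/conv.py hunks above are a pure rename of the instance attribute output_padding to the private _output_padding; the constructor argument and the produced shapes stay the same. A hedged sketch of the unaffected public usage:

.. code-block:: python

    import paddle
    import paddle.nn as nn

    # output_padding is still passed via the constructor; only the internal
    # attribute that stores it was renamed to self._output_padding.
    conv = nn.Conv2DTranspose(4, 6, kernel_size=3, stride=2, output_padding=1)
    x = paddle.randn([2, 4, 8, 8], dtype='float32')
    print(conv(x).shape)   # [2, 6, 18, 18]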
+ +from .functional_layers import FloatFunctionalLayer # noqa: F401 +from .functional_layers import add # noqa: F401 +from .functional_layers import subtract # noqa: F401 +from .functional_layers import multiply # noqa: F401 +from .functional_layers import divide # noqa: F401 +from .functional_layers import reshape # noqa: F401 +from .functional_layers import transpose # noqa: F401 +from .functional_layers import concat # noqa: F401 +from .functional_layers import flatten # noqa: F401 + +__all__ = [] diff --git a/python/paddle/nn/quant/functional_layers.py b/python/paddle/nn/quant/functional_layers.py new file mode 100644 index 00000000000000..ce5fb3e616eb59 --- /dev/null +++ b/python/paddle/nn/quant/functional_layers.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...fluid.dygraph import layers +from ...tensor import math, manipulation + +__all__ = [] + + +class FloatFunctionalLayer(layers.Layer): + def __init__(self): + super(FloatFunctionalLayer, self).__init__() + + +class add(FloatFunctionalLayer): + def __init__(self): + super(add, self).__init__() + + def forward(self, x, y, name=None): + return math.add(x, y, name) + + +class subtract(FloatFunctionalLayer): + def __init__(self): + super(subtract, self).__init__() + + def forward(self, x, y, name=None): + return math.subtract(x, y, name) + + +class multiply(FloatFunctionalLayer): + def __init__(self): + super(multiply, self).__init__() + + def forward(self, x, y, name=None): + return math.multiply(x, y, name) + + +class divide(FloatFunctionalLayer): + def __init__(self): + super(divide, self).__init__() + + def forward(self, x, y, name=None): + return math.divide(x, y, name) + + +class reshape(FloatFunctionalLayer): + def __init__(self): + super(reshape, self).__init__() + + def forward(self, x, shape, name=None): + return manipulation.reshape(x, shape, name) + + +class transpose(FloatFunctionalLayer): + def __init__(self): + super(transpose, self).__init__() + + def forward(self, x, perm, name=None): + return manipulation.transpose(x, perm, name) + + +class concat(FloatFunctionalLayer): + def __init__(self): + super(concat, self).__init__() + + def forward(self, x, axis=0, name=None): + return manipulation.concat(x, axis, name) + + +class flatten(FloatFunctionalLayer): + def __init__(self): + super(flatten, self).__init__() + + def forward(self, x, start_axis=0, stop_axis=-1, name=None): + return manipulation.flatten(x, start_axis, stop_axis, name) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 7da933a9b72798..db4e80d8d9a59b 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -1349,7 +1349,7 @@ def step(self, metrics, epoch=None): if isinstance(metrics, (Tensor, numpy.ndarray)): assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ "should be (1L,), but the current metrics.shape is {}. 
Maybe that " \ - "you should call paddle.mean to process it first.".format(loss.shape) + "you should call paddle.mean to process it first.".format(metrics.shape) elif not isinstance(metrics, (int, float, numpy.float32, numpy.float64)): raise TypeError( diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index faff090bcb1f4e..85c5c60a34c500 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -252,6 +252,19 @@ def _create_accumulators(self, block, parameters): ) self._add_accumulator(self._velocity_acc_str, p) + def _create_regularization_of_grad(self, param, grad, regularization=None): + """ Create and add backward regularization Operators + + Function helper of append_regularization_ops. + """ + # If the ParamAttr regularizer is L2Decay, we skip regularization here and + # fuse L2Decay with momentum instead; see _append_optimize_op below. + if hasattr(param, 'regularizer') and isinstance(param.regularizer, + L2DecayRegularizer): + return grad + return super(Momentum, self)._create_regularization_of_grad( + param, grad, regularization) + def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): @@ -261,6 +274,20 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0]) lr = self._create_param_lr(param_and_grad) + # For fusion of momentum and l2decay + param = param_and_grad[0] + regularization_method = self._regularization_method + regularization_coeff = self._regularization_coeff + if hasattr(param, 'regularizer'): + # the param's L2Decay was skipped above, so fuse it with momentum here. + if isinstance(param.regularizer, L2DecayRegularizer): + regularization_method = "l2_decay" + regularization_coeff = param.regularizer._regularization_coeff + # the param's regularization has already been applied, so avoid doing L2Decay in momentum.
+ elif param.regularizer is not None: + regularization_method = "" + regularization_coeff = 0 + if framework.in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) @@ -268,8 +295,8 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad[0], param_and_grad[1], velocity_acc, lr, param_and_grad[0], velocity_acc, 'mu', self._momentum, 'use_nesterov', self._use_nesterov, 'regularization_method', - self._regularization_method, 'regularization_coeff', - self._regularization_coeff) + regularization_method, 'regularization_coeff', + regularization_coeff) return None find_master = self._multi_precision and param_and_grad[ @@ -280,8 +307,8 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "mu": self._momentum, "use_nesterov": self._use_nesterov, - "regularization_method": self._regularization_method, - "regularization_coeff": self._regularization_coeff, + "regularization_method": regularization_method, + "regularization_coeff": regularization_coeff, "multi_precision": find_master, "rescale_grad": self._rescale_grad } diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 0f22b920b17deb..93b618b7c9edc0 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -32,7 +32,6 @@ from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper from ..fluid.layers import ops -from ..fluid.regularizer import append_regularization_ops from ..fluid.dygraph import base as imperative_base from ..fluid.dygraph import no_grad from paddle.fluid import core @@ -310,11 +309,11 @@ def set_state_dict(self, state_dict): assert model_np.shape == load_para_np.shape, \ "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - item.name, model_np.shape, load_para_np.shape) + model_np.name, model_np.shape, load_para_np.shape) assert model_np.dtype == load_para_np.dtype, \ "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - item.name, model_np.dtype, load_para_np.dtype) + model_np.name, model_np.dtype, load_para_np.dtype) tensor.set(load_para_np, framework._current_expected_place()) @@ -850,8 +849,8 @@ def apply_gradients(self, params_grads): params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any - params_grads = append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops(params_grads, + self.regularization) optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops @@ -874,7 +873,7 @@ def _apply_optimize(self, loss, startup_program, params_grads): if isinstance(params_grads, list): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) - params_grads = append_regularization_ops( + params_grads = self.append_regularization_ops( params_grads, self.regularization) else: grad_clip = params_grads['grad_clip'] @@ -882,7 +881,7 @@ def _apply_optimize(self, loss, startup_program, params_grads): params_grads['params'] = grad_clip(params_grads[ 'params']) - params_grads['params'] = append_regularization_ops( + params_grads['params'] = self.append_regularization_ops( params_grads['params'], self.regularization) optimize_ops = self._create_optimization_pass(params_grads) else: @@ -891,6 +890,93 @@ def _apply_optimize(self, loss, startup_program, params_grads): optimize_ops = 
self.apply_gradients(params_grads) return optimize_ops + def _create_regularization_of_grad(self, param, grad, regularization=None): + """ Create and add backward regularization Operators + + Function helper of append_regularization_ops. + """ + # If no gradient or no regularization is specified, then we don't need to do anything + if grad is None or ((not hasattr(param, 'regularizer') or + (hasattr(param, 'regularizer') and + param.regularizer is None)) and + regularization is None): + return grad + regularization_term = None + if hasattr(param, 'regularizer') and param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad, grad.block) + + assert regularization_term is not None + + new_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + # FIXME(zcd): If the grad is SELECTED_ROWS, after regularization, + # the grad's type and name will be changed. But the gradient's name + # is used in ParallelExecutor Reduce mode, so I add a flag for + # the new_grad here. + new_grad = grad.block.create_var( + name=grad.name + core.kNewGradSuffix(), + dtype=param.dtype, + shape=param.shape, + lod_level=param.lod_level, + type=core.VarDesc.VarType.LOD_TENSOR) + + inputs = {"X": [grad, regularization_term]} + outputs = {"Out": [new_grad]} + if framework.in_dygraph_mode(): + new_grad = core.ops.sum([grad, regularization_term]) + else: + grad.block.append_op(type='sum', inputs=inputs, outputs=outputs) + + return new_grad + + def append_regularization_ops(self, + parameters_and_grads, + regularization=None): + r"""Create and add backward regularization Operators + + Creates and adds backward regularization operators in the BlockDesc. + This will add gradients of the regularizer function to the gradients + of the parameters and return these modified gradients. This is the + same as implementing weight decay in optimizers for regularization. + + Args: + parameters_and_grads: A list of (parameters, gradients) pairs + that need to be regularized. + regularization: A global regularizer. If a parameter's own + regularizer is not set, this global one is applied. + + Returns: + list[(Variable, Variable)]: list of (parameters, gradients) \ + pair with the regularized gradient + + Raises: + Exception: Unknown regularization type + """ + params_and_grads = [] + if framework.in_dygraph_mode(): + for param, grad in parameters_and_grads: + new_grad = self._create_regularization_of_grad(param, grad, + regularization) + params_and_grads.append((param, new_grad)) + else: + repeat_regularizer = False + with framework.name_scope('regularization'): + for param, grad in parameters_and_grads: + if not repeat_regularizer and param.regularizer is not None and regularization is not None: + repeat_regularizer = True + logging.info( + "If the regularizer of a Parameter has already been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr', " + "the Regularization[%s] in Optimizer will not take effect on it, and will only be applied to other Parameters!"
% regularization.__str__()) + with param.block.program._optimized_guard([param, grad]): + new_grad = self._create_regularization_of_grad( + param, grad, regularization) + params_and_grads.append((param, new_grad)) + return params_and_grads + def _get_no_grad_set(self, loss, no_grad_set=None): no_grad_set = _get_no_grad_set_name(no_grad_set) parameters = loss.block.program.global_block().all_parameters() diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index da9749722e1329..66f971c59d7d5b 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -18,6 +18,7 @@ import six import sys import warnings +import logging from six.moves.queue import Queue from six.moves import zip_longest diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 89da75ae91e40e..688bff4a678f2a 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -85,6 +85,7 @@ 'load', 'save_inference_model', 'load_inference_model', + 'normalize_program', 'load_program_state', 'set_program_state', 'cpu_places', diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index 58e8ebc481d799..a9cae0c14e3b19 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -157,7 +157,7 @@ def normalize_program(program, feed_vars, fetch_vars): exe.run(paddle.static.default_startup_program()) # normalize main program. - program = default_main_program() + program = paddle.static.default_main_program() normalized_program = paddle.static.normalize_program(program, [image], [predict]) """ diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c8d80fc9bc68cb..596cd926231cc7 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -18,6 +18,7 @@ from .attribute import imag # noqa: F401 from .creation import to_tensor # noqa: F401 from .creation import diag # noqa: F401 +from .creation import diagflat # noqa: F401 from .creation import eye # noqa: F401 from .creation import linspace # noqa: F401 from .creation import ones # noqa: F401 @@ -163,6 +164,8 @@ from .math import any # noqa: F401 from .math import broadcast_shape # noqa: F401 from .math import conj # noqa: F401 +from .math import neg # noqa: F401 +from .math import lgamma # noqa: F401 from .random import multinomial # noqa: F401 from .random import standard_normal # noqa: F401 @@ -280,6 +283,8 @@ 'isnan', 'broadcast_shape', 'conj', + 'neg', + 'lgamma', 'equal', 'equal_all', 'greater_equal', diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index e1012e7656a3d3..b446a5921b0673 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -40,9 +40,8 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Constructs a ``paddle.Tensor`` from ``data`` , which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor. - If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy - will be performed and return origin tensor, otherwise a new tensor will be constructed - and returned. + If ``data`` is already a Tensor, a copy will be performed and a new tensor returned. + If you only want to change the stop_gradient property, please set ``Tensor.stop_gradient = stop_gradient`` directly. Args: data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor.
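The reworded to_tensor contract can be exercised directly; this sketch mirrors the test_to_tensor_not_change_input_stop_gradient case added above:

.. code-block:: python

    import paddle

    a = paddle.zeros([4])
    a.stop_gradient = False
    b = paddle.to_tensor(a)   # always copies; never returns `a` itself

    print(a.stop_gradient)    # False -- the input tensor is left untouched
    print(b.stop_gradient)    # True  -- the copy takes the documented default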
@@ -75,32 +74,31 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # paddle.to_tensor(1) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, # [1]) - x = paddle.to_tensor(1) - paddle.to_tensor(x, dtype='int32', place=paddle.CPUPlace()) # A new tensor will be constructed due to different dtype or place - # Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True, + x = paddle.to_tensor(1, stop_gradient=False) + print(x) + # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=False, # [1]) - paddle.to_tensor((1.1, 2.2), place=paddle.CUDAPinnedPlace()) - # Tensor(shape=[1], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True, - # [1]) + paddle.to_tensor(x) # A new tensor will be created with default stop_gradient=True + # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, + # [1]) - paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CUDAPlace(0), stop_gradient=False) - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CPUPlace(), stop_gradient=False) + # Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=False, # [[0.10000000, 0.20000000], # [0.30000001, 0.40000001]]) type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64')) - # <class 'paddle.VarBase'> + # <class 'paddle.Tensor'> paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64') - # Tensor(shape=[2, 2], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # Tensor(shape=[2, 2], dtype=complex64, place=CPUPlace, stop_gradient=True, # [[(1+1j), (2+0j)], # [(3+2j), (4+0j)]]) """ - place = _get_paddle_place(place) if place is None: place = _current_expected_place() @@ -119,10 +117,7 @@ if not isinstance(data, np.ndarray): - def _handle_diff_place_dtype(data, dtype, place, stop_gradient): - data.stop_gradient = stop_gradient - if not data.place._equals(place): - data = data._copy_to(place, False) + def _handle_dtype(data, dtype): if dtype: if convert_dtype(dtype) != convert_dtype(data.dtype): return data.astype(convert_dtype(dtype)) @@ -138,11 +133,17 @@ def _handle_diff_place_dtype(data, dtype, place, stop_gradient): "this means the input data contains nested lists with different lengths. " ) elif isinstance(data, paddle.Tensor): - return _handle_diff_place_dtype(data, dtype, place, stop_gradient) - elif isinstance(data, (core.Tensor, core.LoDTensor)): - # convert LoDTensor to VarBase first, and then process it as input VarBase + data = data._copy_to(place, False) + data = _handle_dtype(data, dtype) + data.stop_gradient = stop_gradient + elif isinstance(data, core.LoDTensor): + # convert LoDTensor to VarBase first + # Currently, LoDTensor does not copy when the places are the same data = paddle.Tensor(data) - return _handle_diff_place_dtype(data, dtype, place, stop_gradient) + if not data.place._equals(place): + data = data._copy_to(place, False) + data = _handle_dtype(data, dtype) + data.stop_gradient = stop_gradient else: raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". @@ -584,7 +585,7 @@ def tril(x, diagonal=0, name=None): Args: x (Tensor): The input x which is a Tensor. - Support data types: ``float64``, ``float32``, ``int32``, ``int64``. + Support data types: ``bool``, ``float64``, ``float32``, ``int32``, ``int64``.
diagonal (int, optional): The diagonal to consider, default value is 0. If :attr:`diagonal` = 0, all elements on and below the main diagonal are retained. A positive value includes just as many diagonals above the main @@ -771,6 +772,131 @@ def meshgrid(*args, **kwargs): return out + +def diagflat(x, offset=0, name=None): + """ + If ``x`` is a vector (1-D tensor), a 2-D square tensor with the elements of ``x`` as the diagonal is returned. + + If ``x`` is a tensor (more than 1-D), a 2-D square tensor with the elements of flattened ``x`` as the diagonal is returned. + + The argument ``offset`` controls the diagonal offset. + + + If ``offset`` = 0, it is the main diagonal. + + If ``offset`` > 0, it is a superdiagonal. + + If ``offset`` < 0, it is a subdiagonal. + + Args: + x (Tensor): The input tensor. It can be any shape. Its data type should be float32, float64, int32, int64. + offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal. Default: 0 (main diagonal). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, a square matrix. The output data type is the same as the input data type. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1, 2, 3]) + y = paddle.diagflat(x) + print(y.numpy()) + # [[1 0 0] + # [0 2 0] + # [0 0 3]] + + y = paddle.diagflat(x, offset=1) + print(y.numpy()) + # [[0 1 0 0] + # [0 0 2 0] + # [0 0 0 3] + # [0 0 0 0]] + + y = paddle.diagflat(x, offset=-1) + print(y.numpy()) + # [[0 0 0 0] + # [1 0 0 0] + # [0 2 0 0] + # [0 0 3 0]] + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[1, 2], [3, 4]]) + y = paddle.diagflat(x) + print(y.numpy()) + # [[1 0 0 0] + # [0 2 0 0] + # [0 0 3 0] + # [0 0 0 4]] + + y = paddle.diagflat(x, offset=1) + print(y.numpy()) + # [[0 1 0 0 0] + # [0 0 2 0 0] + # [0 0 0 3 0] + # [0 0 0 0 4] + # [0 0 0 0 0]] + + y = paddle.diagflat(x, offset=-1) + print(y.numpy()) + # [[0 0 0 0 0] + # [1 0 0 0 0] + # [0 2 0 0 0] + # [0 0 3 0 0] + # [0 0 0 4 0]] + """ + padding_value = 0 + if in_dygraph_mode(): + if len(x.shape) == 1: + return core.ops.diag_v2(x, "offset", offset, "padding_value", + padding_value) + else: + y, _ = core.ops.flatten_contiguous_range(x, "start_axis", 0, + "stop_axis", -1) + return core.ops.diag_v2(y, "offset", offset, "padding_value", + padding_value) + + check_type(x, 'x', (Variable), 'diagflat') + check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], + 'diagflat') + check_type(offset, 'offset', (int), 'diagflat') + + helper = LayerHelper("diagflat", **locals()) + out1 = helper.create_variable_for_type_inference(dtype=x.dtype) + out1_shape = helper.create_variable_for_type_inference(x.dtype) + out2 = helper.create_variable_for_type_inference(dtype=x.dtype) + + if len(x.shape) == 1: + helper.append_op( + type='diag_v2', + inputs={'X': x}, + outputs={'Out': out2}, + attrs={'offset': offset, + 'padding_value': padding_value}) + else: + helper.append_op( + type='flatten_contiguous_range', + inputs={'X': x}, + outputs={'Out': out1, + 'XShape': out1_shape}, + attrs={'start_axis': 0, + 'stop_axis': -1}) + out1.stop_gradient = True + + helper.append_op( + type='diag_v2', + inputs={'X': out1}, + outputs={'Out': out2}, + attrs={'offset': offset, + 'padding_value': padding_value}) + out2.stop_gradient = True + return out2 + + def diag(x, offset=0, padding_value=0, name=None):
""" If ``x`` is a vector (1-D tensor), a 2-D square tensor whth the elements of ``x`` as the diagonal is returned. @@ -1053,3 +1179,64 @@ def assign(x, output=None): check_type(x, 'x', (Variable, np.ndarray, list, tuple, float, int, bool), 'assign') return tensor.assign(x, output) + + +#NOTE(zhiqiu): not public +def _memcpy(input, place=None, output=None): + """ + + The OP copies the :attr:`input` to the :attr:`output`. + NOTE: currently, only support CUDAPlace <-> CUDAPinnedPlace or NPUPlace <-> CPUPlace. + + Parameters: + input (Tensor): A tensor. Its data type supports float16, float32, float64, int32, int64, and bool. + device (Place): Target place for the output. + output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will + be created as :attr:`output`. Default: None. + + Returns: + Tensor: A tensor with the same shape, data type and value as :attr:`input`. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + result = paddle._memcpy(data, place=paddle.CPUPlace()) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] + """ + helper = LayerHelper('memcpy', **locals()) + check_type(input, 'input', (Variable), 'memcpy') + + if isinstance(input, (Variable, core.VarBase)): + check_dtype(input.dtype, 'input', [ + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', + 'uint8', 'bool' + ], 'memcpy', '(When the type of input in memcpy is Variable.)') + if output is None: + output = helper.create_variable_for_type_inference(dtype=input.dtype) + + dst_place_type = -1 + if place is None: + dst_place_type = -1 + else: + p = core.Place() + p.set_place(place) + if p.is_cpu_place(): + dst_place_type = 0 + elif p.is_gpu_place(): + dst_place_type = 1 + elif p.is_cuda_pinned_place(): + dst_place_type = 2 + elif p.is_xpu_place(): + dst_place_type = 3 + elif p.is_npu_place(): + dst_place_type = 4 + + attrs = {'dst_place_type': dst_place_type} + helper.append_op( + type='memcpy', + inputs={'X': [input]}, + outputs={'Out': [output]}, + attrs=attrs) + return output diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 2f69946c52139b..652c7c41fb8cc0 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -62,6 +62,7 @@ from ..fluid.layers import sqrt # noqa: F401 from ..fluid.layers import sqrt_ # noqa: F401 from ..fluid.layers import sin # noqa: F401 +from ..fluid.layers import lgamma # noqa: F401 from ..fluid.layers import multiplex # noqa: F401 from ..fluid import layers @@ -2280,3 +2281,27 @@ def conj(x, name=None): helper.append_op(type='conj', inputs={'X': x}, outputs={'Out': [out]}) return out + +def neg(x, name=None): + """ + This function computes the negative of the Tensor elementwisely. + + Args: + x (Tensor): Input of neg operator, an N-D Tensor, with data type float32, float64, int8, int16, int32, or int64. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The negative of input Tensor. The shape and data type are the same with input Tensor. + + Examples: + .. 
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 2f69946c52139b..652c7c41fb8cc0 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -62,6 +62,7 @@
 from ..fluid.layers import sqrt  # noqa: F401
 from ..fluid.layers import sqrt_  # noqa: F401
 from ..fluid.layers import sin  # noqa: F401
+from ..fluid.layers import lgamma  # noqa: F401
 from ..fluid.layers import multiplex  # noqa: F401
 from ..fluid import layers
 
@@ -2280,3 +2281,27 @@ def conj(x, name=None):
     helper.append_op(type='conj', inputs={'X': x}, outputs={'Out': [out]})
 
     return out
+
+def neg(x, name=None):
+    """
+    This function computes the negative of the Tensor elementwise.
+
+    Args:
+        x (Tensor): Input of neg operator, an N-D Tensor, with data type float32, float64, int8, int16, int32, or int64.
+        name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor): The negative of the input Tensor. The shape and data type are the same as the input Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
+            out = paddle.neg(x)
+            print(out)
+            # [0.4 0.2 -0.1 -0.3]
+    """
+
+    return layers.scale(x, scale=-1.0, bias=0.0, bias_after_scale=True, act=None, name=name)
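Reviewer note: because `neg` is just `scale(x, scale=-1.0)`, it should match both the unary minus operator and an explicit `paddle.scale` call. A small check, assuming this patch is applied:

```python
import paddle
import numpy as np

x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
# All three spellings compute the same elementwise negation.
np.testing.assert_allclose(paddle.neg(x).numpy(), (-x).numpy())
np.testing.assert_allclose(
    paddle.neg(x).numpy(), paddle.scale(x, scale=-1.0).numpy())
```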
diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py
index abf79fb1e3974c..2e9efddf9712e3 100644
--- a/python/paddle/tests/test_dataset_cifar.py
+++ b/python/paddle/tests/test_dataset_cifar.py
@@ -32,6 +32,8 @@ def test_main(self):
             self.assertTrue(data.shape[2] == 3)
             self.assertTrue(data.shape[1] == 32)
             self.assertTrue(data.shape[0] == 32)
+            self.assertTrue(len(label.shape) == 1)
+            self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 9)
 
@@ -49,6 +51,8 @@ def test_main(self):
             self.assertTrue(data.shape[2] == 3)
             self.assertTrue(data.shape[1] == 32)
             self.assertTrue(data.shape[0] == 32)
+            self.assertTrue(len(label.shape) == 1)
+            self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 9)
 
         # test cv2 backend
@@ -63,6 +67,8 @@ def test_main(self):
             self.assertTrue(data.shape[2] == 3)
             self.assertTrue(data.shape[1] == 32)
             self.assertTrue(data.shape[0] == 32)
+            self.assertTrue(len(label.shape) == 1)
+            self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 99)
 
         with self.assertRaises(ValueError):
@@ -83,6 +89,8 @@ def test_main(self):
             self.assertTrue(data.shape[2] == 3)
             self.assertTrue(data.shape[1] == 32)
             self.assertTrue(data.shape[0] == 32)
+            self.assertTrue(len(label.shape) == 1)
+            self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 99)
 
@@ -100,6 +108,8 @@ def test_main(self):
             self.assertTrue(data.shape[2] == 3)
             self.assertTrue(data.shape[1] == 32)
             self.assertTrue(data.shape[0] == 32)
+            self.assertTrue(len(label.shape) == 1)
+            self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 99)
 
         # test cv2 backend
@@ -114,6 +124,8 @@ def test_main(self):
             self.assertTrue(data.shape[2] == 3)
             self.assertTrue(data.shape[1] == 32)
             self.assertTrue(data.shape[0] == 32)
+            self.assertTrue(len(label.shape) == 1)
+            self.assertTrue(label.shape[0] == 1)
             self.assertTrue(0 <= int(label) <= 99)
 
         with self.assertRaises(ValueError):
diff --git a/python/paddle/tests/test_download.py b/python/paddle/tests/test_download.py
index 4be2dde1bccb13..986d84dd153b2f 100644
--- a/python/paddle/tests/test_download.py
+++ b/python/paddle/tests/test_download.py
@@ -77,6 +77,31 @@ def test_retry_exception(self, ):
                 'www.baidu.com',
                 './test', )
 
+    def test_wget_download_error(self, ):
+        with self.assertRaises(RuntimeError):
+            from paddle.utils.download import _download
+            _download('www.baidu', './test', method='wget')
+
+    def test_download_methods(self, ):
+        urls = [
+            "https://paddle-hapi.bj.bcebos.com/unittest/files.tar",
+            "https://paddle-hapi.bj.bcebos.com/unittest/files.zip",
+        ]
+
+        import sys
+        from paddle.utils.download import _download
+        if sys.platform == 'linux':
+            methods = ['wget', 'get']
+        else:
+            methods = ['get']
+
+        for url in urls:
+            for method in methods:
+                _download(
+                    url,
+                    path='./test',
+                    method=method, )
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py
index ae574a8241bfff..0ced69c0f2ea96 100644
--- a/python/paddle/tests/test_model.py
+++ b/python/paddle/tests/test_model.py
@@ -126,7 +126,7 @@ class TestModel(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         if not fluid.is_compiled_with_cuda():
-            self.skipTest('module not tested when ONLY_CPU compling')
+            cls.skipTest('module not tested when ONLY_CPU compiling')
         cls.device = paddle.set_device('gpu')
         fluid.enable_dygraph(cls.device)
diff --git a/python/paddle/text/datasets/wmt14.py b/python/paddle/text/datasets/wmt14.py
index 424a564216d190..38ca09bf299831 100644
--- a/python/paddle/text/datasets/wmt14.py
+++ b/python/paddle/text/datasets/wmt14.py
@@ -14,9 +14,10 @@
 
 from __future__ import print_function
 
+import six
 import tarfile
 import numpy as np
 import gzip
 
 from paddle.io import Dataset
 import paddle.compat as cpt
diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py
index 3ad627ddea9274..29baddff05af22 100644
--- a/python/paddle/utils/download.py
+++ b/python/paddle/utils/download.py
@@ -21,6 +21,7 @@
 import os.path as osp
 import shutil
 import requests
+import subprocess
 import hashlib
 import tarfile
 import zipfile
@@ -121,7 +122,8 @@ def get_path_from_url(url,
                       root_dir,
                       md5sum=None,
                       check_exist=True,
-                      decompress=True):
+                      decompress=True,
+                      method='get'):
     """ Download from given url to root_dir.
     if file or directory specified by url is exists under
     root_dir, return the path directly, otherwise download
@@ -132,7 +134,9 @@ def get_path_from_url(url,
         root_dir (str): root dir for downloading, it should be
                         WEIGHTS_HOME or DATASET_HOME
         md5sum (str): md5 sum of download package
-
+        decompress (bool): decompress zip or tar file. Default is `True`
+        method (str): which download method to use. Supported methods are `wget` and `get`. Default is `get`.
+
     Returns:
         str: a local path to save downloaded models & weights & datasets.
     """
@@ -150,7 +154,7 @@ def get_path_from_url(url,
         logger.info("Found {}".format(fullpath))
     else:
         if ParallelEnv().current_endpoint in unique_endpoints:
-            fullpath = _download(url, root_dir, md5sum)
+            fullpath = _download(url, root_dir, md5sum, method=method)
         else:
             while not os.path.exists(fullpath):
                 time.sleep(1)
@@ -163,13 +167,79 @@ def get_path_from_url(url,
     return fullpath
 
 
-def _download(url, path, md5sum=None):
+def _get_download(url, fullname):
+    # using requests.get method
+    fname = osp.basename(fullname)
+    try:
+        req = requests.get(url, stream=True)
+    except Exception as e:  # requests.exceptions.ConnectionError
+        logger.info("Downloading {} from {} failed with exception {}".format(
+            fname, url, str(e)))
+        return False
+
+    if req.status_code != 200:
+        raise RuntimeError("Downloading from {} failed with code "
+                           "{}!".format(url, req.status_code))
+
+    # To protect against interrupted downloads, download to
+    # tmp_fullname first, then move tmp_fullname to fullname
+    # after the download finishes
+    tmp_fullname = fullname + "_tmp"
+    total_size = req.headers.get('content-length')
+    with open(tmp_fullname, 'wb') as f:
+        if total_size:
+            with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
+                for chunk in req.iter_content(chunk_size=1024):
+                    f.write(chunk)
+                    pbar.update(1)
+        else:
+            for chunk in req.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+    shutil.move(tmp_fullname, fullname)
+
+    return fullname
+
+
+def _wget_download(url, fullname):
+    # using wget to download url
+    tmp_fullname = fullname + "_tmp"
+    # --user-agent
+    command = 'wget -O {} -t {} {}'.format(tmp_fullname, DOWNLOAD_RETRY_LIMIT,
+                                           url)
+    subprc = subprocess.Popen(
+        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    _ = subprc.communicate()
+
+    if subprc.returncode != 0:
+        raise RuntimeError(
+            '{} failed. Please make sure `wget` is installed or {} exists'.
+            format(command, url))
+
+    shutil.move(tmp_fullname, fullname)
+
+    return fullname
+
+
+_download_methods = {
+    'get': _get_download,
+    'wget': _wget_download,
+}
+
+
+def _download(url, path, md5sum=None, method='get'):
     """ Download from url, save to path.
 
     url (str): download url
     path (str): download to given path
+    md5sum (str): md5 sum of download package
+    method (str): which download method to use. Supported methods are `wget` and `get`. Default is `get`.
+
     """
+    assert method in _download_methods, 'make sure `{}` is implemented'.format(
+        method)
+
     if not osp.exists(path):
         os.makedirs(path)
 
@@ -177,6 +247,7 @@ def _download(url, path, md5sum=None):
     fullname = osp.join(path, fname)
     retry_cnt = 0
 
+    logger.info("Downloading {} from {}".format(fname, url))
     while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
         if retry_cnt < DOWNLOAD_RETRY_LIMIT:
             retry_cnt += 1
@@ -184,38 +255,10 @@ def _download(url, path, md5sum=None):
             raise RuntimeError("Download from {} failed. "
                                "Retry limit reached".format(url))
 
-        logger.info("Downloading {} from {}".format(fname, url))
-
-        try:
-            req = requests.get(url, stream=True)
-        except Exception as e:  # requests.exceptions.ConnectionError
-            logger.info(
-                "Downloading {} from {} failed {} times with exception {}".
-                format(fname, url, retry_cnt + 1, str(e)))
+        if not _download_methods[method](url, fullname):
             time.sleep(1)
             continue
 
-        if req.status_code != 200:
-            raise RuntimeError("Downloading from {} failed with code "
-                               "{}!".format(url, req.status_code))
-
-        # For protecting download interupted, download to
-        # tmp_fullname firstly, move tmp_fullname to fullname
-        # after download finished
-        tmp_fullname = fullname + "_tmp"
-        total_size = req.headers.get('content-length')
-        with open(tmp_fullname, 'wb') as f:
-            if total_size:
-                with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
-                    for chunk in req.iter_content(chunk_size=1024):
-                        f.write(chunk)
-                        pbar.update(1)
-            else:
-                for chunk in req.iter_content(chunk_size=1024):
-                    if chunk:
-                        f.write(chunk)
-        shutil.move(tmp_fullname, fullname)
-
     return fullname
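Reviewer note: with the new `method` argument, callers can now pick the downloader explicitly. A usage sketch based on the unit test above (`wget` only works where the wget binary is available, e.g. on Linux):

```python
from paddle.utils.download import get_path_from_url

# 'get' (requests-based, with a tqdm progress bar) is the default;
# 'wget' shells out to the wget binary and retries up to
# DOWNLOAD_RETRY_LIMIT times via its -t option.
path = get_path_from_url(
    "https://paddle-hapi.bj.bcebos.com/unittest/files.tar",
    root_dir="./test",
    method="wget")
print(path)
```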
diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py
index 0a0a48026af80e..ff3734bf7a030d 100644
--- a/python/paddle/vision/datasets/cifar.py
+++ b/python/paddle/vision/datasets/cifar.py
@@ -151,7 +151,8 @@ def _load_data(self):
                     six.b('labels'), batch.get(six.b('fine_labels'), None))
                 assert labels is not None
                 for sample, label in six.moves.zip(data, labels):
-                    self.data.append((sample, label))
+                    self.data.append((sample,
+                                      np.array([label]).astype('int64')))
 
     def __getitem__(self, idx):
         image, label = self.data[idx]
@@ -164,9 +165,9 @@ def __getitem__(self, idx):
             image = self.transform(image)
 
         if self.backend == 'pil':
-            return image, np.array(label).astype('int64')
+            return image, label.astype('int64')
 
-        return image.astype(self.dtype), np.array(label).astype('int64')
+        return image.astype(self.dtype), label.astype('int64')
 
     def __len__(self):
         return len(self.data)
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py
index 60a7a90c9be895..769e33c7355791 100644
--- a/python/paddle/vision/ops.py
+++ b/python/paddle/vision/ops.py
@@ -247,7 +247,9 @@ def yolo_box(x,
              downsample_ratio,
              clip_bbox=True,
              name=None,
-             scale_x_y=1.):
+             scale_x_y=1.,
+             iou_aware=False,
+             iou_aware_factor=0.5):
     r"""
 
     This operator generates YOLO detection boxes from output of YOLOv3 network.
@@ -256,7 +258,8 @@ def yolo_box(x,
     should be the same, H and W specify the grid size, each grid point predict
     given number boxes, this given number, which following will be represented as S,
     is specified by the number of anchors. In the second dimension(the channel
-    dimension), C should be equal to S * (5 + class_num), class_num is the object
+    dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false,
+    otherwise C should be equal to S * (6 + class_num). class_num is the object
     category number of source dataset(such as 80 in coco dataset), so the
     second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
     also includes confidence score of the box and class one-hot key of each anchor
@@ -292,6 +295,15 @@ def yolo_box(x,
        score_{pred} = score_{conf} * score_{class}
     $$
 
+    where the confidence scores follow the formula below
+
+    .. math::
+
+        score_{conf} = \begin{cases}
+            obj, & \text{if } iou\_aware == false \\
+            obj^{1 - iou\_aware\_factor} * iou^{iou\_aware\_factor}, & \text{otherwise}
+        \end{cases}
+
     Args:
         x (Tensor): The input tensor of YoloBox operator is a 4-D tensor with
                     shape of [N, C, H, W]. The second dimension(C) stores box
@@ -313,13 +325,14 @@ def yolo_box(x,
                                  should be set for the first, second,
                                  and thrid :attr:`yolo_box` layer.
         clip_bbox (bool): Whether clip output bonding box in :attr:`img_size`
-            boundary. Default true."
-            "
+            boundary. Default true.
         scale_x_y (float): Scale the center point of decoded bounding box.
             Default 1.0
         name (string): The default value is None.  Normally there is no need
            for user to set this property.  For more information, please
            refer to :ref:`api_guide_Name`
+        iou_aware (bool): Whether to use IoU-aware confidence scores. Default false.
+        iou_aware_factor (float): IoU-aware factor. Default 0.5.
 
     Returns:
         Tensor: A 3-D tensor with shape [N, M, 4], the coordinates of boxes,
@@ -358,7 +371,8 @@ def yolo_box(x,
         boxes, scores = core.ops.yolo_box(
             x, img_size, 'anchors', anchors, 'class_num', class_num,
             'conf_thresh', conf_thresh, 'downsample_ratio', downsample_ratio,
-            'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y)
+            'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y, 'iou_aware',
+            iou_aware, 'iou_aware_factor', iou_aware_factor)
         return boxes, scores
 
     helper = LayerHelper('yolo_box', **locals())
@@ -378,6 +392,8 @@ def yolo_box(x,
         "downsample_ratio": downsample_ratio,
         "clip_bbox": clip_bbox,
         "scale_x_y": scale_x_y,
+        "iou_aware": iou_aware,
+        "iou_aware_factor": iou_aware_factor
     }
 
     helper.append_op(
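Reviewer note: a standalone sketch of the IoU-aware confidence formula documented above (plain Python, not the operator itself):

```python
def conf_score(obj, iou, iou_aware=False, iou_aware_factor=0.5):
    """Confidence score as described in the yolo_box docstring above."""
    if not iou_aware:
        return obj
    return obj ** (1.0 - iou_aware_factor) * iou ** iou_aware_factor

# With iou_aware off, the objectness is used directly; with it on,
# objectness and the predicted IoU are blended geometrically.
print(conf_score(0.9, 0.6))                  # 0.9
print(conf_score(0.9, 0.6, iou_aware=True))  # ~0.735, i.e. sqrt(0.9 * 0.6)
```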
diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py
index 99cbfd6dc4f8dd..8ebe542c645c50 100644
--- a/python/paddle/vision/transforms/functional_cv2.py
+++ b/python/paddle/vision/transforms/functional_cv2.py
@@ -392,7 +392,8 @@ def adjust_hue(img, hue_factor):
     cv2 = try_import('cv2')
 
     if not (-0.5 <= hue_factor <= 0.5):
-        raise ValueError('hue_factor is not in [-0.5, 0.5].'.format(hue_factor))
+        raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format(
+            hue_factor))
 
     dtype = img.dtype
     img = img.astype(np.uint8)
diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py
index eee60c5452b2de..d94309bcb88424 100644
--- a/python/paddle/vision/transforms/functional_pil.py
+++ b/python/paddle/vision/transforms/functional_pil.py
@@ -378,7 +378,8 @@ def adjust_hue(img, hue_factor):
     """
     if not (-0.5 <= hue_factor <= 0.5):
-        raise ValueError('hue_factor is not in [-0.5, 0.5].'.format(hue_factor))
+        raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format(
+            hue_factor))
 
     input_mode = img.mode
     if input_mode in {'L', '1', 'I', 'F'}:
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 00e12689c4d9fe..eb7bc595c16eb3 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -854,13 +854,13 @@ class ColorJitter(BaseTransform):
     """Randomly change the brightness, contrast, saturation and hue of an image.
 
     Args:
-        brightness: How much to jitter brightness.
+        brightness (float): How much to jitter brightness.
             Chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. Should be non negative numbers.
-        contrast: How much to jitter contrast.
+        contrast (float): How much to jitter contrast.
             Chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. Should be non negative numbers.
-        saturation: How much to jitter saturation.
+        saturation (float): How much to jitter saturation.
             Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. Should be non negative numbers.
-        hue: How much to jitter hue.
+        hue (float): How much to jitter hue.
             Chosen uniformly from [-hue, hue]. Should have 0<= hue <= 0.5.
         keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
diff --git a/python/setup.py.in b/python/setup.py.in
index 98d05c367f1623..866c2b400d5ca6 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -213,6 +213,7 @@ packages=['paddle',
           'paddle.nn',
           'paddle.nn.functional',
           'paddle.nn.layer',
+          'paddle.nn.quant',
           'paddle.nn.initializer',
           'paddle.nn.utils',
           'paddle.metric',
diff --git a/tools/CrossStackProfiler/CspChromeTraceFormatter.py b/tools/CrossStackProfiler/CspChromeTraceFormatter.py
new file mode 100755
index 00000000000000..a8030988aacf1a
--- /dev/null
+++ b/tools/CrossStackProfiler/CspChromeTraceFormatter.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import six
+import sys
+import re
+import os
+import glob
+import unittest
+import tempfile
+import platform
+import pandas as pd
+
+
+class ChromeTraceFormatter(object):
+    def __init__(self):
+        self._events = []
+        self._metadata = []
+
+    def _create_event(self, ph, category, name, pid, tid, timestamp):
+        """Creates a new Chrome Trace event.
+
+        For details of the file format, see:
+        https://github.com/catapult-project/catapult/blob/master/tracing/README.md
+
+        Args:
+            ph: The type of event - usually a single character.
+            category: The event category as a string.
+            name: The event name as a string.
+            pid: Identifier of the process generating this event as an integer.
+            tid: Identifier of the thread generating this event as an integer.
+            timestamp: The timestamp of this event as a long integer.
+
+        Returns:
+            A JSON compatible event object.
+        """
+        event = {}
+        event['ph'] = ph
+        event['cat'] = category
+        event['name'] = name
+        event['pid'] = pid
+        event['tid'] = tid
+        event['ts'] = timestamp
+        return event
+
+    def emit_pid(self, name, pid):
+        """Adds a process metadata event to the trace.
+
+        Args:
+            name: The process name as a string.
+            pid: Identifier of the process as an integer.
+        """
+        event = {}
+        event['name'] = 'process_name'
+        event['ph'] = 'M'
+        event['pid'] = pid
+        event['args'] = {'name': name}
+        self._metadata.append(event)
+
+    def emit_region(self, timestamp, duration, pid, tid, category, name, args):
+        """Adds a region event to the trace.
+
+        Args:
+            timestamp: The start timestamp of this region as a long integer.
+            duration: The duration of this region as a long integer.
+            pid: Identifier of the process generating this event as an integer.
+            tid: Identifier of the thread generating this event as an integer.
+            category: The event category as a string.
+            name: The event name as a string.
+            args: A JSON-compatible dictionary of event arguments.
+        """
+        event = self._create_event('X', category, name, pid, tid, timestamp)
+        event['dur'] = duration
+        event['args'] = args
+        self._events.append(event)
+
+    def emit_counter(self, category, name, pid, timestamp, counter, value):
+        """Emits a record for a single counter.
+
+        Args:
+            category: The event category as a string.
+            name: The event name as a string.
+            pid: Identifier of the process generating this event as an integer.
+            timestamp: The timestamp of this event as a long integer.
+            counter: Name of the counter as a string.
+            value: Value of the counter as an integer.
+        """
+        event = self._create_event('C', category, name, pid, 0, timestamp)
+        event['args'] = {counter: value}
+        self._events.append(event)
+
+    def format_to_string(self, pretty=False):
+        """Formats the chrome trace to a string.
+
+        Args:
+            pretty: (Optional.) If True, produce human-readable JSON output.
+
+        Returns:
+            A JSON-formatted string in Chrome Trace format.
+        """
+        trace = {}
+        trace['traceEvents'] = self._metadata + self._events
+        if pretty:
+            return json.dumps(trace, indent=4, separators=(',', ': '))
+        else:
+            return json.dumps(trace, separators=(',', ':'))
+
+    def clear(self):
+        self._events = []
+        self._metadata = []
+
+
+if __name__ == "__main__":
+    pass
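Reviewer note: a usage sketch for the formatter above, assuming `CspChromeTraceFormatter.py` is importable; the resulting JSON can be loaded in chrome://tracing:

```python
from CspChromeTraceFormatter import ChromeTraceFormatter

formatter = ChromeTraceFormatter()
formatter.emit_pid("trainer_0", 0)  # names the process row with pid 0
# One complete ('X') duration event on pid 0 / tid 0.
formatter.emit_region(1000, 500, 0, 0, "Op", "conv2d",
                      {"detail_info": "forward"})
# One counter ('C') sample, rendered as a counter track.
formatter.emit_counter("dcgm", "GPUTL", 0, 1000, "GPUTL", 85)
print(formatter.format_to_string(pretty=True))
```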
+ """ + event = {} + event['ph'] = ph + event['cat'] = category + event['name'] = name + event['pid'] = pid + event['tid'] = tid + event['ts'] = timestamp + return event + + def emit_pid(self, name, pid): + """Adds a process metadata event to the trace. + + Args: + name: The process name as a string. + pid: Identifier of the process as an integer. + """ + event = {} + event['name'] = 'process_name' + event['ph'] = 'M' + event['pid'] = pid + event['args'] = {'name': name} + self._metadata.append(event) + + def emit_region(self, timestamp, duration, pid, tid, category, name, args): + """Adds a region event to the trace. + + Args: + timestamp: The start timestamp of this region as a long integer. + duration: The duration of this region as a long integer. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + category: The event category as a string. + name: The event name as a string. + args: A JSON-compatible dictionary of event arguments. + """ + event = self._create_event('X', category, name, pid, tid, timestamp) + event['dur'] = duration + event['args'] = args + self._events.append(event) + + def emit_counter(self, category, name, pid, timestamp, counter, value): + """Emits a record for a single counter. + + Args: + category: The event category as string + name: The event name as string + pid: Identifier of the process generating this event as integer + timestamp: The timestamps of this event as long integer + counter: Name of the counter as string + value: Value of the counter as integer + tid: Thread id of the allocation as integer + """ + event = self._create_event('C', category, name, pid, 0, timestamp) + event['args'] = {counter: value} + self._events.append(event) + + def format_to_string(self, pretty=False): + """Formats the chrome trace to a string. + + Args: + pretty: (Optional.) If True, produce human-readable JSON output. + + Returns: + A JSON-formatted string in Chrome Trace format. + """ + trace = {} + trace['traceEvents'] = self._metadata + self._events + if pretty: + return json.dumps(trace, indent=4, separators=(',', ': ')) + else: + return json.dumps(trace, separators=(',', ':')) + + def clear(self): + self._events = [] + self._metadata = [] + + +if __name__ == "__main__": + pass diff --git a/tools/CrossStackProfiler/CspFileReader.py b/tools/CrossStackProfiler/CspFileReader.py new file mode 100755 index 00000000000000..12de488aa693eb --- /dev/null +++ b/tools/CrossStackProfiler/CspFileReader.py @@ -0,0 +1,400 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import json +import glob +import logging +import pandas as pd +from multiprocessing import Process, Lock +""" Some terms to clarify the code + in most case, one or more paremeters may be set as input args for a class or a function + in form of single variable or k-v dict + + 1. trainerId + 2. gpuId + 3. rankId + 4. gpuPerTrainer + 5. 
+    5. groupSize
+    6. groupId
+    7. groupNum
+    8. displaySize
+    9. dataPath
+    10. resultPath
+    11. fileOrganizeForm -- "byRank" OR "byTrainer" or "other"
+
+"""
+
+PIPELINEINFO_TRACE_NUM = 1
+
+dcgmMetricParameterMap = {
+    "02_gpuUtility": [("GPUTL", "GPUTL"), ("GRACT", "GRACT")],
+    "03_smUtility": [("SMACT", "SMACT"), ("SMOCC", "SMOCC")],
+    "04_memUtility": [("FB_USED_RATIO", "FB_USED_RATIO"), ("DRAMA", "DRAMA")],
+    "05_txUtility": [("NVLTX", "NVLTX"), ("NVLRX", "NVLRX"), ("PCITX", "PCITX"),
+                     ("PCIRX", "PCIRX")],
+    "06_calUtility":
+    [("FP32A", "FP32A"), ("FP16A", "FP16A"), ("TENSO", "TENSO")]
+}
+DCGMINFO_TRACE_NUM = len(dcgmMetricParameterMap.keys())
+NETINFO_TRACE_NUM = 2
+
+DCGM_PATH = "dcgm"
+NET_PATH = "net"
+TIME_PATH = "time"
+PROFILE_PATH = "profile"
+
+FILEORGANIZEFORM_BYRANK = "byRank"
+FILEORGANIZEFORM_BYTRAINER = "byTrainer"
+FILEORGANIZEFORM_BYOTHER = "other"
+FILEORGANIZEFORM = [
+    FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER,
+    FILEORGANIZEFORM_BYOTHER
+]
+
+
+class FileReader(object):
+    def __init__(self, logger, args):
+        self._logger = logger
+        self._args = args
+
+        self._fileList = []
+        self._fileNum = 0
+
+        self._dataPath = ""
+        self._groupSize = 0
+        self._displaySize = 0
+        self._organizeForm = FILEORGANIZEFORM_BYOTHER
+        self._gpuPerTrainer = 0
+
+        self._checkArgs()
+        self._getFileList()
+
+        self._lock = Lock()
+
+    def printArgs(self):
+        self._logger.info("dataPath:")
+        self._logger.info(self._dataPath)
+        self._logger.info("groupSize:")
+        self._logger.info(self._groupSize)
+        self._logger.info("displaySize:")
+        self._logger.info(self._displaySize)
+        self._logger.info("organizeForm:")
+        self._logger.info(self._organizeForm)
+        self._logger.info("gpuPerTrainer:")
+        self._logger.info(self._gpuPerTrainer)
+        self._logger.info("minTimeStamp:")
+        self._logger.info(self._minTimeStamp)
+
+    def _checkArgsKey(self, key, expected_type):
+        if key not in self._args:
+            raise KeyError("args should have key [%s]!" % key)
+
+        if not isinstance(self._args[key], expected_type):
+            raise TypeError(
+                "Invalid type of key [%s] in args dict, it should be a %s!" %
+                (key, expected_type))
+
+        setattr(self, "_%s" % key, self._args[key])
+
+    def _align_ts(self, ts):
+        return ts - self._minTimeStamp
+
+    def _checkArgs(self):
+        if not isinstance(self._args, dict):
+            raise TypeError("Invalid type of args, it should be a dict!")
+
+        self._checkArgsKey("organizeForm", str)
+        if self._organizeForm not in FILEORGANIZEFORM or \
+                self._organizeForm == FILEORGANIZEFORM_BYOTHER:
+            raise NotImplementedError(
+                "we do not know how to process this form of file [%s]!" %
+                self._organizeForm)
+
+        self._checkArgsKey("gpuPerTrainer", int)
+
+        self._checkArgsKey("dataPath", str)
+        if not os.path.exists(self._dataPath):
+            raise IOError("input data path [%s] does not exist!" %
+                          (self._dataPath))
+
+        self._checkArgsKey("groupSize", int)
+        self._checkArgsKey("displaySize", int)
+        self._checkArgsKey("minTimeStamp", int)
+
+    def getFileListByGroup(self, groupId):
+        lIndext = 0
+        rIndext = 0
+
+        if self._organizeForm == FILEORGANIZEFORM_BYTRAINER:
+            lIndext = groupId * self._groupSize
+            rIndext = (groupId + 1) * self._groupSize
+        elif self._organizeForm == FILEORGANIZEFORM_BYRANK:
+            lIndext = groupId * self._groupSize * self._gpuPerTrainer
+            rIndext = (groupId + 1) * self._groupSize * self._gpuPerTrainer
+
+        try:
+            return self._fileList[lIndext:rIndext]
+        except IndexError:
+            raise IndexError("invalid index of file list")
+
+    def getFileList(self):
+        return self._fileList
+
+    def _cmp(self, x, y):
+        return self._getId(x, self._organizeForm) - self._getId(
+            y, self._organizeForm)
+
+    def _getFileList(self):
+        self._fileList = glob.glob(os.path.join(self._dataPath, "*.*"))
+
+        # check unique
+        idList = []
+        newFileList = []
+        for file in self._fileList:
+            id = self._getId(file, self._organizeForm)
+            if id not in idList:
+                idList.append(id)
+                newFileList.append(file)
+            else:
+                raise NotImplementedError(
+                    "[%s] is repeated by id, we do not know how to process it!"
+                    % file)
+
+        if self._fileList:
+            if (self._getId(self._fileList[-1], self._organizeForm) -
+                    self._getId(self._fileList[0], self._organizeForm)
+                ) != len(self._fileList) - 1:
+                raise Exception("The file id should be continuous!")
+
+        # sort
+        def _sortBySuffix(elem):
+            return int(elem.split(".")[-1])
+
+        self._fileList.sort(key=_sortBySuffix)
+
+        if not self._fileList:
+            self._logger.warning("we can not find any file in dir [%s]!" %
+                                 self._dataPath)
+        else:
+            self._logger.info("file list in dir [%s] is : %s !" %
+                              (self._dataPath, ', '.join(self._fileList)))
+
+        return self._fileList
+
+    def _getId(self, fileName, organizeForm, sed="."):
+        if self._organizeForm != organizeForm:
+            raise TypeError("Can not get rank id when organizer form is not %s!"
+                            % organizeForm)
+
+        if not os.path.isfile(fileName):
+            raise IOError("[%s] is not a valid file!" % (fileName))
+
+        try:
+            suffix_str = fileName.split(sed)[-1]
+            try:
+                return int(suffix_str)
+            except ValueError as Argument:
+                print(Argument)
+                raise TypeError("invalid fileName [%s]" % fileName)
+
+        except IndexError as Argument:
+            print(Argument)
+            raise TypeError(
+                "invalid fileName [%s], the suffix should be a number!" %
+                fileName)
+
+    def getRankId(self, fileName, sed="."):
+        return self._getId(fileName, FILEORGANIZEFORM_BYRANK, sed)
+
+    def getRankNum(self):
+        if self._organizeForm == FILEORGANIZEFORM_BYRANK:
+            return len(self._fileList)
+
+        elif self._organizeForm == FILEORGANIZEFORM_BYTRAINER:
+            return len(self._fileList) * self._gpuPerTrainer
+
+    def getTrainerNum(self):
+        if self._organizeForm == FILEORGANIZEFORM_BYRANK:
+            return len(self._fileList) // self._gpuPerTrainer
+
+        elif self._organizeForm == FILEORGANIZEFORM_BYTRAINER:
+            return len(self._fileList)
+
+    def getTrainerId(self, fileName, sed="."):
+        return self._getId(fileName, FILEORGANIZEFORM_BYTRAINER, sed)
+
+    def _splitTaskListForMultiProcess(self, ls, n):
+        if not isinstance(ls, list) or not isinstance(n, int):
+            return []
+        ls_len = len(ls)
+        if n <= 0 or 0 == ls_len:
+            return []
+        if n >= ls_len:
+            return [[i] for i in ls]
+        else:
+            j = int((ls_len + n - 1) / n)
+            k = ls_len % n
+            ls_return = []
+            end = 0
+            for i in range(0, (n) * j, j):
+                if i < len(ls) and (i + j) < len(ls):
+                    ls_return.append(ls[i:i + j])
+                    end = i + j
+            ls_return.append(ls[end:])
+            return ls_return
+
+    def getOpInfoFileName(self, groupId, gpuId, tmpPath="./tmp"):
+        return self.getFileName("opinfo", groupId, gpuId, tmpPath)
+
+    def getPipeLineInfoFileName(self, groupId, gpuId, tmpPath="./tmp"):
+        return self.getFileName("pipilineinfo", groupId, gpuId, tmpPath)
+
+    def getDCGMInfoFileName(self, groupId, gpuId, tmpPath="./tmp"):
+        return self.getFileName("dcgm", groupId, gpuId, tmpPath)
+
+    def getFileName(self, name, groupId, gpuId, tmpPath="./tmp"):
+        return os.path.join(tmpPath, "%s_%d_%d.json" % (name, groupId, gpuId))
+
+    def getOpInfoDict(self, groupId, gpuId, tmpPath="./tmp"):
+        return self.getDict("opinfo", groupId, gpuId, tmpPath)
+
+    def getDcgmInfoDict(self, groupId, gpuId, tmpPath="./tmp"):
+        return self.getDict("dcgm", groupId, gpuId, tmpPath)
+
+    def getDict(self, name, groupId, gpuId, tmpPath="./tmp"):
+        fileName = self.getFileName(name, groupId, gpuId, tmpPath)
+        if not os.path.isfile(fileName):
+            raise IOError("[%s] does not exist!" % fileName)
+
+        data = {}
+        with open(fileName, "r") as rf:
+            try:
+                data = json.load(rf)
+            except Exception:
+                self._logger.error("read [%s] error. not a json file!" %
+                                   (fileName))
+                raise TypeError("read [%s] error. not a json file!" %
+                                (fileName))
+        return data
+
+    def dumpOpInfoDict(self,
+                       data,
+                       groupId,
+                       gpuId,
+                       pretty=False,
+                       tmpPath="./tmp"):
+        return self.dumpDict(
+            data, "opinfo", groupId, gpuId, pretty=pretty, tmpPath=tmpPath)
+
+    def dumpDCGMDict(self, data, groupId, gpuId, pretty=False, tmpPath="./tmp"):
+        return self.dumpDict(
+            data, "dcgm", groupId, gpuId, pretty=pretty, tmpPath=tmpPath)
+
+    def dumpDict(self,
+                 data,
+                 name,
+                 groupId,
+                 gpuId,
+                 pretty=False,
+                 tmpPath="./tmp"):
+        self._lock.acquire()
+        if not os.path.exists(tmpPath):
+            os.makedirs(tmpPath)
+        self._lock.release()
+        if pretty:
+            jsObj = json.dumps(data, indent=4, separators=(',', ': '))
+        else:
+            jsObj = json.dumps(data, separators=(',', ':'))
+
+        fileName = self.getFileName(name, groupId, gpuId, tmpPath)
+        if os.path.isfile(fileName):
+            os.remove(fileName)
+
+        fileObject = open(fileName, 'w')
+        fileObject.write(jsObj)
+        fileObject.close()
+        self._logger.info("dump [%s] successfully!" % fileName)
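Reviewer note: `_splitTaskListForMultiProcess` above chunks a task list into at most `n` ceil-sized slices, with whatever is left going into the last slice. A standalone replica of the logic, with sample output:

```python
def split_tasks(ls, n):
    """Sketch replicating FileReader._splitTaskListForMultiProcess."""
    if not isinstance(ls, list) or not isinstance(n, int):
        return []
    if n <= 0 or not ls:
        return []
    if n >= len(ls):
        return [[i] for i in ls]  # one single-item chunk per task
    j = (len(ls) + n - 1) // n    # ceil(len/n) tasks per worker
    out, end = [], 0
    for i in range(0, n * j, j):
        if i < len(ls) and (i + j) < len(ls):
            out.append(ls[i:i + j])
            end = i + j
    out.append(ls[end:])          # remainder goes to the last worker
    return out

print(split_tasks(list(range(10)), 3))  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
print(split_tasks(list(range(3)), 8))   # [[0], [1], [2]]
```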
+
+
+def getLogger():
+    logger = logging.getLogger()
+    logger.setLevel(logging.DEBUG)
+
+    rq = time.strftime('%Y%m%d%H%M.%S', time.localtime(time.time()))
+    log_path = os.path.dirname(os.getcwd()) + '/Logs/'
+    if not os.path.exists(log_path):
+        os.makedirs(log_path)
+
+    log_name = log_path + rq + '.log'
+    logfile = log_name
+    fh = logging.FileHandler(logfile, mode='w')
+    fh.setLevel(logging.DEBUG)
+
+    formatter = logging.Formatter(
+        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(process)d - %(levelname)s: %(message)s"
+    )
+    fh.setFormatter(formatter)
+
+    logger.addHandler(fh)
+    return logger
+
+
+def test_FileReader(args):
+    try:
+        testReader = FileReader(None, args)
+    except Exception as Argument:
+        print(Argument)
+    else:
+        testReader.printArgs()
+
+
+if __name__ == "__main__":
+    args = 0
+    test_FileReader(args)
+
+    args = {
+        "dataPath": ".",
+        "groupSize": 1,
+        "displaySize": 1,
+        "gpuPerTrainer": 8,
+        "organizeForm": FILEORGANIZEFORM_BYOTHER,
+    }
+    test_FileReader(args)
+
+    args = {
+        "dataPath": ".",
+        "groupSize": 1,
+        "displaySize": 1,
+        "gpuPerTrainer": 8,
+        "organizeForm": FILEORGANIZEFORM_BYTRAINER,
+    }
+    test_FileReader(args)
+
+    args = {
+        "dataPath": "./res",
+        "groupSize": 1,
+        "displaySize": 1,
+        "gpuPerTrainer": 8,
+        "organizeForm": FILEORGANIZEFORM_BYTRAINER,
+    }
+    test_FileReader(args)
+
+    args = {
+        "dataPath": ".",
+        "groupSize": "",
+        "displaySize": 1,
+        "gpuPerTrainer": 8,
+        "organizeForm": FILEORGANIZEFORM_BYTRAINER,
+    }
+    test_FileReader(args)
diff --git a/tools/CrossStackProfiler/CspReporter.py b/tools/CrossStackProfiler/CspReporter.py
new file mode 100755
index 00000000000000..1b8ae0e3855348
--- /dev/null
+++ b/tools/CrossStackProfiler/CspReporter.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import glob
+import logging
+import argparse
+import multiprocessing
+
+import pandas as pd
+from multiprocessing import Process
+
+from NetFileReader import netFileReader
+from DCGMFileReader import dcgmFileReader
+from ProfileFileReader import profileFileReader
+
+from CspFileReader import getLogger
+from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH
+from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM
+from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM
+
+
+def get_argparse():
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        '--profile_path',
+        type=str,
+        default='.',
+        help='Working path that stores the monitor data.')
+
+    parser.add_argument(
+        '--timeline_path',
+        type=str,
+        default='.',
+        help='Output timeline file name.')
+
+    parser.add_argument(
+        '--gpuPerTrainer', type=int, default=8, help='GPUs per trainer.')
+
+    parser.add_argument(
+        '--trainerNum', type=int, default=4, help='Number of trainers.')
+
+    parser.add_argument(
+        '--groupSize', type=int, default=8,
+        help='Number of trainers in a group.')
+
+    parser.add_argument(
+        '--displaySize',
+        type=int,
+        default=2,
+        help='Number of lines to display in a group.')
+
+    return parser.parse_args()
+
+
+class CspReporter(object):
+    def __init__(self, args):
+        self._args = args
+        print(self._args)
+
+        self._workPath = self._args.profile_path
+        self._saveFilePath = self._args.timeline_path
+        self._gpuPerTrainer = self._args.gpuPerTrainer
+        self._groupSize = self._args.groupSize
+        self._displaySize = self._args.displaySize
+        self._trainerNum = self._args.trainerNum
+
+        self._checkArgs()
+
+        self._init_logger()
+        self._init_timeInfo()
+        self._init_reader()
+
+    def _checkArgs(self):
+        if self._trainerNum % self._groupSize != 0:
+            raise Exception(
+                "Input args error: trainerNum[%d] %% groupSize[%d] != 0" %
+                (self._trainerNum, self._groupSize))
+
+    def _init_logger(self):
+        self._logger = getLogger()
+
+    def _init_reader(self):
+        self._dcgmPath = os.path.join(self._workPath, DCGM_PATH)
+        self._netPath = os.path.join(self._workPath, NET_PATH)
+        self._profilePath = os.path.join(self._workPath, PROFILE_PATH)
+
+        self._netFileReaderArgs = {
+            "dataPath": self._netPath,
+            "groupSize": self._groupSize,
+            "displaySize": self._displaySize,
+            "gpuPerTrainer": self._gpuPerTrainer,
+            "minTimeStamp": self._minTimeStamp,
+            "organizeForm": FILEORGANIZEFORM_BYTRAINER,
+        }
+
+        self._dcgmFileReaderArgs = {
+            "dataPath": self._dcgmPath,
+            "groupSize": self._groupSize,
+            "displaySize": self._displaySize,
+            "gpuPerTrainer": self._gpuPerTrainer,
+            "minTimeStamp": self._minTimeStamp,
+            "organizeForm": FILEORGANIZEFORM_BYTRAINER,
+        }
+
+        self._profileFileReaderArgs = {
+            "dataPath": self._profilePath,
+            "groupSize": self._groupSize,
+            "displaySize": self._displaySize,
+            "gpuPerTrainer": self._gpuPerTrainer,
+            "minTimeStamp": self._minTimeStamp,
+            "organizeForm": FILEORGANIZEFORM_BYRANK,
+        }
+
+        self._dcgmFileReader = dcgmFileReader(self._logger,
+                                              self._dcgmFileReaderArgs)
+        self._profileFileReader = profileFileReader(self._logger,
+                                                    self._profileFileReaderArgs)
+
+    def _init_timeInfo(self):
+        self._timePath = os.path.join(self._workPath, TIME_PATH)
+        self._timeInfo = {}
+        self._minTimeStamp = 0
+        self._set_timeInfo()
+
+    def _set_timeInfo(self, timeFileNamePrefix="time.txt", sed="."):
+        timeFileNameList = glob.glob(
+            os.path.join(self._timePath, timeFileNamePrefix + sed + "*"))
+        for timeFileName in timeFileNameList:
+            trainerId = int(timeFileName.split(sed)[-1])
+            gpuId = int(timeFileName.split(sed)[-2])
+            info = {}
+            with open(timeFileName, "r") as rf:
+                for line in rf:
+                    if line.startswith("start time:"):
+                        info["start_time"] = int(
+                            float(line.split(":")[-1]) * 1e9)
+
+                        self._minTimeStamp = min(self._minTimeStamp,
+                                                 info["start_time"])
+
+                    if line.startswith("end time:"):
+                        info["end_time"] = int(float(line.split(":")[-1]) * 1e9)
+            if info:
+                self._timeInfo[gpuId * trainerId] = info
+
+    def _generateTraceFileByGroupAndGpuId(self, pipileInfo, netInfo, groupId,
+                                          gpuId):
+        dcgmInfoDict = self._dcgmFileReader.getDcgmInfoDict(groupId, gpuId)
+        opInfoDict = self._profileFileReader.getOpInfoDict(groupId, gpuId)
+
+        traceObj = {}
+        traceObj["traceEvents"] = pipileInfo[str(gpuId)] + opInfoDict[
+            "traceEvents"] + dcgmInfoDict["traceEvents"] + netInfo[
+                "traceEvents"]
+
+        self._profileFileReader.dumpDict(traceObj, "traceFile", groupId, gpuId,
+                                         False, self._saveFilePath)
+
+    def _generateTraceFileByGroup(self, groupId, processNum):
+        # first we need to generate pipeline info
+        pipileInfo = self._profileFileReader.getPipeLineInfo(groupId,
+                                                             processNum)
+
+        # second we need to generate dcgm info
+        dcgmInfo = self._dcgmFileReader.getDCGMTraceInfo(groupId, processNum)
+
+        # third we need to generate net info
+        netInfo = {}
+        netInfo["traceEvents"] = []
+        # netInfo = self._netFileReader.parseFileByGroup(groupId, processNum)
+
+        # fourth we need to generate op info
+        opInfo = self._profileFileReader.getOPTraceInfo(groupId)
+
+        # finally we need to dump this information to disk
+        processPool = []
+        pidList = []
+
+        for gpuId in range(self._gpuPerTrainer):
+            subproc = Process(
+                target=self._generateTraceFileByGroupAndGpuId,
+                args=(
+                    pipileInfo,
+                    netInfo,
+                    groupId,
+                    gpuId, ))
+            processPool.append(subproc)
+            subproc.start()
+            pidList.append(subproc.pid)
+            self._logger.info(
+                "[traceFile]: process [%d] has been started, total task num is %d ..."
+                % (subproc.pid, 1))
+
+        for t in processPool:
+            t.join()
+            pidList.remove(t.pid)
+            self._logger.info(
+                "[traceFile]: process [%d] has exited! remained %d process!" %
+                (t.pid, len(pidList)))
+
+    def generateTraceFile(self, processNum=8):
+        processPool = []
+        pidList = []
+        for groupId in range(self._trainerNum // self._groupSize):
+            subproc = Process(
+                target=self._generateTraceFileByGroup,
+                args=(
+                    groupId,
+                    processNum, ))
+            processPool.append(subproc)
+            subproc.start()
+            pidList.append(subproc.pid)
+            self._logger.info(
+                "[GroupTraceFile]: process [%d] has been started, total task num is %d ..."
+                % (subproc.pid, 1))
+        for t in processPool:
+            t.join()
+            pidList.remove(t.pid)
+            self._logger.info(
+                "[GroupTraceFile]: process [%d] has exited! remained %d process!"
+                % (t.pid, len(pidList)))
+
+
+if __name__ == '__main__':
+    args = get_argparse()
+    tl = CspReporter(args)
+    tl.generateTraceFile()
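Reviewer note: a hypothetical end-to-end invocation of the reporter above; the directory names follow the TIME_PATH/DCGM_PATH/NET_PATH/PROFILE_PATH constants from CspFileReader, and all concrete paths here are assumptions:

```python
# trainerNum must be divisible by groupSize (enforced in _checkArgs).
# ./data is expected to contain the monitor output in sub-directories:
#   ./data/time  ./data/dcgm  ./data/net  ./data/profile
import subprocess

subprocess.run([
    "python", "CspReporter.py",
    "--profile_path", "./data",
    "--timeline_path", "./out",
    "--gpuPerTrainer", "8",
    "--trainerNum", "4",
    "--groupSize", "4",
    "--displaySize", "2",
], check=True)
```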
diff --git a/tools/CrossStackProfiler/DCGMFileReader.py b/tools/CrossStackProfiler/DCGMFileReader.py
new file mode 100755
index 00000000000000..599acb44c6556c
--- /dev/null
+++ b/tools/CrossStackProfiler/DCGMFileReader.py
@@ -0,0 +1,269 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import json
+import glob
+import logging
+import tempfile
+import argparse
+import pandas as pd
+import multiprocessing
+from multiprocessing import Process
+
+from CspChromeTraceFormatter import ChromeTraceFormatter
+
+from CspFileReader import FileReader
+from CspFileReader import getLogger
+from CspFileReader import dcgmMetricParameterMap
+from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH
+from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM
+from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM
+
+
+class dcgmFileReader(FileReader):
+    def parseFileByGroup(self, groupId, processNum=8):
+        fileFist = self.getFileListByGroup(groupId)
+        displaySize = min(self._displaySize, len(fileFist))
+        fileFist = fileFist[:displaySize]
+
+        if processNum == 0:
+            return self._parseTask(fileFist)
+        else:
+            self._logger.info("using [%d] process to do this work!" %
+                              processNum)
+            processPool = []
+            pidList = []
+
+            manager = multiprocessing.Manager()
+            q = manager.Queue()
+
+            taskList = self._splitTaskListForMultiProcess(fileFist, processNum)
+            for task in taskList:
+                subproc = Process(
+                    target=self._parseTask, args=(
+                        task,
+                        q, ))
+                processPool.append(subproc)
+                subproc.start()
+                pidList.append(subproc.pid)
+                self._logger.info(
+                    "[DCGM reader]: process [%d] has been started, total task num is %d ..."
+                    % (subproc.pid, len(processPool)))
+
+            for t in processPool:
+                t.join()
+                pidList.remove(t.pid)
+                self._logger.info(
+                    "[DCGM reader]: process [%d] has exited! remained %d process!"
+                    % (t.pid, len(pidList)))
+
+            isFistProcess = True
+            for t in processPool:
+                if isFistProcess:
+                    isFistProcess = False
+                    dcgm_data = q.get()
+                else:
+                    dcgm_data = pd.concat(
+                        [dcgm_data, q.get()], axis=0, join='outer')
+
+            return dcgm_data
+
+    def _parseTask(self, taskList, q=None):
+        is_first = True
+        for fileName in taskList:
+            self._logger.info("I am processing %s!" % fileName)
+            tmp_data = self._parseSingleFile(fileName)
+            if tmp_data is None:
+                continue
+
+            if is_first:
+                is_first = False
+                dcgm_data = tmp_data
+            else:
+                dcgm_data = pd.concat(
+                    [dcgm_data, tmp_data], axis=0, join='outer')
+        dcgm_data = dcgm_data.dropna()
+        if not q is None:
+            q.put(dcgm_data)
+        self._logger.info("I finish processing %s!" % fileName)
+        return dcgm_data
+
+    def _parseSingleFile(self, fileName):
+        trainerId = self.getTrainerId(fileName)
+
+        if not os.path.exists(fileName):
+            logging.warning(fileName + ' not found')
+            return
+
+        regex_list = [
+            (re.compile(r' +'), ','),
+            (re.compile(r'^,'), ''),
+        ]
+
+        csv_tempfile = tempfile.TemporaryFile()
+        with open(fileName, 'r') as fp:
+            has_header = False
+
+            for line in fp:
+                # skip `nvidia-dcgm-dmon.sh` init and fini info lines
+                if 'nv-hostengine' in line or 'dmon' in line or 'Host Engine Listener Started' in line:
+                    continue
+
+                if not line.strip().startswith("GPU") and not line.strip(
+                ).startswith("# Entity"):
+                    continue
+
+                # skip redundant headers (only the header on the first line is needed)
+                if line.strip().startswith("# Entity"):
+                    line = line.strip()[2:]
+
+                if 'Entity' == line[0:len('Entity')]:
+                    if has_header:
+                        continue
+                    else:
+                        has_header = True
+
+                if line.strip().startswith("GPU"):
+                    line = line.strip()[3:]
+
+                for r in regex_list:
+                    line = r[0].sub(r[1], line)
+
+                csv_tempfile.write((line + "\n").encode())
+
+        csv_tempfile.seek(0)
+
+        dcgm = pd.read_csv(csv_tempfile, header=0, delimiter=',')
+        # dcgm.info()
+        dcgm['FB_USED_RATIO'] = dcgm['FBUSD'] / dcgm['FBTTL']
+        dcgm['GPUTL'] = dcgm['GPUTL'] / 100.0
+        dcgm['ts'] = dcgm['TIMESTAMP'] * 1e9
+        dcgm['trainerId'] = trainerId
+
+        return dcgm
+
+    def _getDCGMTraceInfoByGpuId(self,
+                                 groupId,
+                                 gpuId,
+                                 dcgm_data,
+                                 pid_map,
+                                 q=None):
+        self._logger.info(
+            "Begin to generate dcgm info, groupId = %d, gpuID = %d ..." %
+            (groupId, gpuId))
+
+        gpuDcgmData = dcgm_data[dcgm_data['Entity'].isin([gpuId])]
+
+        traceEventList = []
+        for metric, parameteList in dcgmMetricParameterMap.items():
+            metaInfo = {}
+            metaInfo['name'] = 'process_name'
+            metaInfo['ph'] = 'M'
+            metaInfo['pid'] = pid_map[metric]
+            metaInfo['args'] = {'name': metric}
+            traceEventList.append(metaInfo)
+
+        for index, row in gpuDcgmData.iterrows():
+            for metric, parameteList in dcgmMetricParameterMap.items():
+                trainerId = int(row['trainerId']) % self._groupSize
+                if trainerId >= self._displaySize:
+                    continue
+
+                di = {}
+                # name = "%s_%d" % (metric, trainerId)
+                name = "%s" % (metric)
+                di['name'] = name
+                di['pid'] = pid_map[metric]
+                di['ts'] = self._align_ts(int(row['ts']))
+                # di['ts'] = int(row['ts'])
+                di['cat'] = metric
+                di['tid'] = "%d_%d" % (groupId, trainerId)
+                di['ph'] = "C"
+                di['id'] = trainerId
+
+                args = {}
+                for p in parameteList:
+                    args[p[0]] = row[p[1]]
+                di['args'] = args
+
+                traceEventList.append(di)
+        trace = {}
+        trace['traceEvents'] = traceEventList
+
+        self.dumpDCGMDict(trace, groupId, gpuId, True)
+
+        return trace
+
+    def getDCGMTraceInfo(self, groupId, processNum=8):
+        dcgm_data = self.parseFileByGroup(groupId, processNum)
+
+        pid_map = {}
+        init_pid = PIPELINEINFO_TRACE_NUM
+
+        for metric in dcgmMetricParameterMap.keys():
+            pid_map[metric] = init_pid
+            init_pid = init_pid + 1
+
+        manager = multiprocessing.Manager()
+        q = manager.Queue()
+        processPool = []
+        pidList = []
+
+        for gpuId in range(self._gpuPerTrainer):
+            subproc = Process(
+                target=self._getDCGMTraceInfoByGpuId,
+                args=(
+                    groupId,
+                    gpuId,
+                    dcgm_data,
+                    pid_map,
+                    q, ))
+            processPool.append(subproc)
+            subproc.start()
+            pidList.append(subproc.pid)
+            self._logger.info(
+                "[DCGM info]: process [%d] has been started, total task num is %d ..."
+                % (subproc.pid, 1))
+
+        for t in processPool:
+            t.join()
+            pidList.remove(t.pid)
+            self._logger.info(
+                "[DCGM info]: process [%d] has exited! remained %d process!" %
+                (t.pid, len(pidList)))
+
+        dcgmInfo = {}
+
+        return dcgmInfo
+
+
+def test_dcgmFileReader():
+    args = {
+        "dataPath": "data/newdata/dcgm",
+        "groupSize": 4,
+        "displaySize": 8,
+        "gpuPerTrainer": 8,
+        "minTimeStamp": 0,
+        "organizeForm": FILEORGANIZEFORM_BYTRAINER,
+    }
+
+    testReader = dcgmFileReader(getLogger(), args)
+    testReader.printArgs()
+    data = testReader.getDCGMTraceInfo(0, 8)
+
+
+if __name__ == "__main__":
+    test_dcgmFileReader()
diff --git a/tools/CrossStackProfiler/NetFileReader.py b/tools/CrossStackProfiler/NetFileReader.py
new file mode 100755
index 00000000000000..29c2ae85e60458
--- /dev/null
+++ b/tools/CrossStackProfiler/NetFileReader.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import json
+import glob
+import logging
+import multiprocessing
+import pandas as pd
+
+from multiprocessing import Process
+
+from CspChromeTraceFormatter import ChromeTraceFormatter
+
+from CspFileReader import FileReader
+from CspFileReader import getLogger
+from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH
+from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM
+from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM
+
+
+class netFileReader(FileReader):
+    def _parseSingleFile(self, fileNameList, tx_pid, rx_pid, q=None):
+
+        traceInfo = {}
+        traceEventList = []
+
+        metaInfo = {}
+        metaInfo['name'] = 'process_name'
+        metaInfo['ph'] = 'M'
+        metaInfo['pid'] = tx_pid
+        metaInfo['args'] = {'name': "%02d_tx" % tx_pid}
+
+        traceEventList.append(metaInfo)
+        metaInfo = {}
+        metaInfo['name'] = 'process_name'
+        metaInfo['ph'] = 'M'
+        metaInfo['pid'] = rx_pid
+        metaInfo['args'] = {'name': "%02d_rx" % rx_pid}
+
+        traceEventList.append(metaInfo)
+
+        trainerIdList = []
+        for fileName in fileNameList:
+            trainerId = self.getTrainerId(fileName)
+            trainerIdList.append(trainerId)
+            with open(fileName, "r") as rf:
+                for line in rf:
+                    try:
+                        event_str = json.loads(line.strip())
+                        event_str["pid"] = tx_pid if event_str[
+                            "name"] == "tx" else rx_pid
+                        # the unit of net is ms, we need ns
+                        event_str["ts"] = self._align_ts(event_str["ts"] * 1e6)
+                        event_str["id"] = trainerId
+                        traceEventList.append(event_str)
+
+                    except Exception:
+                        self._logger.warning(
+                            "invalid record [%s] in [%s]. skip it!"
% + (line[:-1], fileName)) + traceInfo["traceEvents"] = traceEventList + + if not q is None: + q.put(traceInfo) + else: + return traceInfo + + def parseFileByGroup(self, groupId, processNum=8): + fileFist = self.getFileListByGroup(groupId) + fileFist = fileFist[:min(self._displaySize, len(fileFist))] + + manager = multiprocessing.Manager() + q = manager.Queue() + + processPool = [] + pidList = [] + tx_pid = PIPELINEINFO_TRACE_NUM + rx_pid = PIPELINEINFO_TRACE_NUM + 1 + + taskList = self._splitTaskListForMultiProcess(fileFist, processNum) + for task in taskList: + subproc = Process( + target=self._parseSingleFile, args=( + task, + tx_pid, + rx_pid, + q, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[Net info]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, len(processPool))) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[Net info]: process [%d] has exited! remained %d process!" % + (t.pid, len(pidList))) + + traceInfo = {} + isFistProcess = True + for t in processPool: + if isFistProcess: + isFistProcess = False + traceInfo["traceEvents"] = q.get()["traceEvents"] + else: + traceInfo["traceEvents"].extend(q.get()["traceEvents"]) + + return traceInfo + + +def test_netFileReader(): + args = { + "dataPath": "data/newdata/net", + "groupSize": 4, + "displaySize": 2, + "gpuPerTrainer": 8, + "minTimeStamp": 0, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + + testReader = netFileReader(getLogger(), args) + testReader.printArgs() + data = testReader.parseFileByGroup(0, 8) + + jsObj = json.dumps(data, indent=4, separators=(',', ': ')) + fileObject = open('jsonFile.json', 'w') + fileObject.write(jsObj) + fileObject.close() + + +if __name__ == "__main__": + test_netFileReader() diff --git a/tools/CrossStackProfiler/ProfileFileReader.py b/tools/CrossStackProfiler/ProfileFileReader.py new file mode 100755 index 00000000000000..0f3299ef5473fa --- /dev/null +++ b/tools/CrossStackProfiler/ProfileFileReader.py @@ -0,0 +1,480 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import os
+import six
+import glob
+import json
+import logging
+import argparse
+import pandas as pd
+import multiprocessing
+from multiprocessing import Process
+
+import google.protobuf.text_format as text_format
+import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
+
+from CspChromeTraceFormatter import ChromeTraceFormatter
+
+from CspFileReader import FileReader
+from CspFileReader import getLogger
+from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH
+from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM
+from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM
+
+
+class profileFileReader(FileReader):
+    def _parseSingleFile(self, profile):
+        with open(profile, 'rb') as f:
+            profile_s = f.read()
+            profile_pb = profiler_pb2.Profile()
+            profile_pb.ParseFromString(profile_s)
+
+        return profile_pb
+
+    def _parseTask(self, taskList, q=None):
+        profile_dict = {}
+
+        for fileName in taskList:
+            rankId = self.getRankId(fileName)
+            profile_dict["trainerRank.%03d" %
+                         (rankId)] = self._parseSingleFile(fileName)
+            self._logger.info("I finish processing %s!" % fileName)
+
+        if not q is None:
+            q.put(profile_dict)
+
+        return profile_dict
+
+    def _is_forwardBackwardInfo(self, items):
+        if items["name"] == "marker/compute/MarkerCUDA":
+            if "args" in items:
+                if isinstance(items["args"], dict):
+                    args = items["args"]
+                    if "detail_info" in args:
+                        if args["detail_info"] == "marker_forward_B" or \
+                            args["detail_info"] == "marker_forward_E" or \
+                            args["detail_info"] == "marker_backward_B" or \
+                            args["detail_info"] == "marker_backward_E":
+                            return True
+        return False
+
+    def _allocate_forwardBackwardInfo(self, restList, pid, tid):
+        def _cmp_ele(items):
+            return items["ts"]
+
+        restList.sort(key=_cmp_ele)
+        newList = []
+
+        lastEle = {}
+        for items in restList:
+            if items["args"]["detail_info"].endswith("E"):
+                if not lastEle:
+                    continue
+                else:
+                    lastEle["dur"] = items["ts"] - lastEle["ts"]
+
+                    name = lastEle["args"]["detail_info"]
+                    name = name[:name.rfind('_')]
+                    name = name.split('_')[1]
+                    lastEle["name"] = name
+                    lastEle["args"]["detail_info"] = name
+                    lastEle["args"]["name"] = name
+                    if name == "backward":
+                        lastEle["cname"] = "good"
+                    else:
+                        lastEle["cname"] = "bad"
+
+                    lastEle["tid"] = tid
+                    lastEle["pid"] = pid
+
+                    newList.append(lastEle)
+            else:
+                lastEle = items
+
+        return newList
+
+    def _getPipeLineInfo(self, profileList, q=None):
+
+        res = {}
+        for profile in profileList:
+            rankId = self.getRankId(profile)
+
+            profile_pb = self._parseSingleFile(profile)
+            traceEventList = []
+            pid = 0
+            tid = rankId
+
+            for event in profile_pb.events:
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args['mem_bytes'] = event.memcopy.bytes
+                if hasattr(event, "detail_info") and event.detail_info:
+                    args['detail_info'] = event.detail_info
+
+                traceEvent = {}
+                traceEvent['ph'] = 'X'
+                traceEvent['cat'] = 'Op'
+                traceEvent['name'] = event.name
+                traceEvent['pid'] = pid
+                traceEvent['tid'] = tid
+                traceEvent['ts'] = self._align_ts(event.start_ns)
+                traceEvent['dur'] = (event.end_ns - event.start_ns) / 1.0
+                traceEvent['args'] = args
+
+                if self._is_forwardBackwardInfo(traceEvent):
+                    traceEventList.append(traceEvent)
+
+            pipeLineList = self._allocate_forwardBackwardInfo(traceEventList,
+                                                              pid, tid)
+
+            res[str(rankId)] = pipeLineList
+
+        if not q is None:
+            q.put(res)
+
+        return res
+
+    def getPipeLineInfo(self, groupId, processNum=8):
+        fileFist = 
+
+        self._logger.info(
+            "using [%d] processes to do this work, total task num is %d!" %
+            (processNum, len(fileList)))
+        processPool = []
+        pidList = []
+
+        manager = multiprocessing.Manager()
+        q = manager.Queue()
+
+        taskList = self._splitTaskListForMultiProcess(fileList, processNum)
+        for task in taskList:
+            subproc = Process(
+                target=self._getPipeLineInfo, args=(
+                    task,
+                    q, ))
+            processPool.append(subproc)
+            subproc.start()
+            pidList.append(subproc.pid)
+            self._logger.info(
+                "[pipeline info]: process [%d] has been started, it will handle %d files ..."
+                % (subproc.pid, len(task)))
+
+        for t in processPool:
+            t.join()
+            pidList.remove(t.pid)
+            self._logger.info(
+                "[pipeline info]: process [%d] has exited, %d processes remaining!"
+                % (t.pid, len(pidList)))
+
+        pipeLineInfo = {}
+
+        metaInfo = {}
+        metaInfo['name'] = 'process_name'
+        metaInfo['ph'] = 'M'
+        metaInfo['pid'] = 0
+        metaInfo['args'] = {
+            'name': "%02d_pipeLineInfo" % PIPELINEINFO_TRACE_NUM
+        }
+
+        for t in processPool:
+            for k, v in q.get().items():
+                rankId = int(k)
+                gpuId = rankId % self._gpuPerTrainer
+                if str(gpuId) not in pipeLineInfo:
+                    pipeLineInfo[str(gpuId)] = [metaInfo]
+                pipeLineInfo[str(gpuId)].extend(v)
+
+        return pipeLineInfo
+
+    def _allocate_pids(self, profile_dict, gpuId, initPid):
+        chrome_trace = ChromeTraceFormatter()
+        devices = dict()
+        mem_devices = dict()
+
+        initLineNum = initPid + 1
+        lineDelta = len(profile_dict.keys())
+        i = 0
+        for k, profile_pb in six.iteritems(profile_dict):
+            lineNum = initLineNum
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    if (k, event.device_id, "CPU") not in devices:
+                        pid = initPid
+                        initPid = initPid + 1
+                        devices[(k, event.device_id, "CPU")] = pid
+                        # a device_id of -1 represents a CUDA API (runtime) call, e.g. cudaLaunch, cudaMemcpy
+                        if event.device_id == -1:
+                            chrome_trace.emit_pid("%02d_%s:cuda_api" %
+                                                  (lineNum, k), pid)
+                            lineNum = lineNum + 1
+                        else:
+                            chrome_trace.emit_pid("%02d_%s:cpu:block:%d" %
+                                                  (lineNum, k, event.device_id),
+                                                  pid)
+                            lineNum = lineNum + 1
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    if (k, event.device_id, "GPUKernel") not in devices:
+                        if gpuId == event.device_id:
+                            pid = initPid
+                            initPid = initPid + 1
+
+                            devices[(k, event.device_id, "GPUKernel")] = pid
+                            chrome_trace.emit_pid("%02d_%s:gpu:%d" %
+                                                  (lineNum, k, event.device_id),
+                                                  pid)
+                            lineNum = lineNum + 1
+
+            if not hasattr(profile_pb, "mem_events"):
+                continue
+            for mevent in profile_pb.mem_events:
+                if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
+                    if (k, mevent.device_id, "GPU") not in mem_devices:
+                        if gpuId == mevent.device_id:
+                            pid = initPid
+                            initPid = initPid + 1
+
+                            mem_devices[(k, mevent.device_id, "GPU")] = pid
+                            chrome_trace.emit_pid(
+                                "%02d_memory usage on %s:gpu:%d" %
+                                (lineNum, k, mevent.device_id), pid)
+                            lineNum = lineNum + 1
+                elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
+                    if (k, mevent.device_id, "CPU") not in mem_devices:
+                        pid = initPid
+                        initPid = initPid + 1
+
+                        mem_devices[(k, mevent.device_id, "CPU")] = pid
+                        chrome_trace.emit_pid("%02d_memory usage on %s:cpu:%d" %
+                                              (lineNum, k, mevent.device_id),
+                                              pid)
+                        lineNum = lineNum + 1
+                elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
+                    if (k, mevent.device_id,
+                            "CUDAPinnedPlace") not in mem_devices:
+                        if gpuId == mevent.device_id:
+                            pid = initPid
+                            initPid = initPid + 1
+
+                            mem_devices[(k, mevent.device_id,
+                                         "CUDAPinnedPlace")] = pid
+                            chrome_trace.emit_pid(
+                                "%02d_memory usage on %s:cudapinnedplace:%d" %
+                                (lineNum, k, mevent.device_id), pid)
+                            lineNum = lineNum + 1
+            if (k, 0, "CPU") not in mem_devices:
+                pid = initPid
+                initPid = initPid + 1
+
+                mem_devices[(k, 0, "CPU")] = pid
+                chrome_trace.emit_pid("%02d_memory usage on %s:cpu:%d" %
+                                      (lineNum, k, 0), pid)
+                lineNum = lineNum + 1
+            if (k, 0, "GPU") not in mem_devices:
+                pid = initPid
+                initPid = initPid + 1
+
+                mem_devices[(k, 0, "GPU")] = pid
+                chrome_trace.emit_pid("%02d_memory usage on %s:gpu:%d" %
+                                      (lineNum, k, 0), pid)
+                lineNum = lineNum + 1
+            if (k, 0, "CUDAPinnedPlace") not in mem_devices:
+                pid = initPid
+                initPid = initPid + 1
+
+                mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
+                chrome_trace.emit_pid(
+                    "%02d_memory usage on %s:cudapinnedplace:%d" %
+                    (lineNum, k, 0), pid)
+                lineNum = lineNum + 1
+            i = i + 1
+        return chrome_trace, devices, mem_devices
+
+    def _allocate_events(self, profile_dict, devices, gpuId):
+        chrome_trace = ChromeTraceFormatter()
+        for k, profile_pb in six.iteritems(profile_dict):
+
+            rankId = int(k.split(".")[-1])
+
+            for event in profile_pb.events:
+                if event.type == profiler_pb2.Event.CPU:
+                    type = "CPU"
+                elif event.type == profiler_pb2.Event.GPUKernel:
+                    type = "GPUKernel"
+
+                if event.type == profiler_pb2.Event.GPUKernel and event.device_id != gpuId and rankId % self._gpuPerTrainer != gpuId:
+                    continue
+
+                pid = devices[(k, event.device_id, type)]
+                args = {'name': event.name}
+                if event.memcopy.bytes > 0:
+                    args['mem_bytes'] = event.memcopy.bytes
+                if hasattr(event, "detail_info") and event.detail_info:
+                    args['detail_info'] = event.detail_info
+                # TODO(panyx0718): Chrome tracing only handles ms. However, some
+                # ops take micro-seconds. Hence, we keep the ns here.
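+                # Each event is emitted as a Chrome-trace "complete" event.
+                # Assuming ChromeTraceFormatter here behaves like the one in
+                # tools/timeline.py, the emitted record is roughly:
+                #     {"ph": "X", "cat": "Op", "name": event.name,
+                #      "pid": pid, "tid": event.sub_device_id,
+                #      "ts": <aligned start_ns>, "dur": <end_ns - start_ns>,
+                #      "args": args}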
+
+                chrome_trace.emit_region(
+                    self._align_ts(event.start_ns),
+                    (event.end_ns - event.start_ns) / 1.0, pid,
+                    event.sub_device_id, 'Op', event.name, args)
+        return chrome_trace
+
+    def _allocate_memory_event(self, profile_dict, mem_devices, gpuId):
+        chrome_trace = ChromeTraceFormatter()
+        if not hasattr(profiler_pb2, "MemEvent"):
+            return
+        place_to_str = {
+            profiler_pb2.MemEvent.CPUPlace: "CPU",
+            profiler_pb2.MemEvent.CUDAPlace: "GPU",
+            profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
+        }
+        for k, profile_pb in six.iteritems(profile_dict):
+            rankId = int(k.split(".")[-1])
+
+            trainerId = rankId // self._gpuPerTrainer
+
+            if trainerId >= self._displaySize:
+                continue
+
+            mem_list = []
+            end_profiler = 0
+            for mevent in profile_pb.mem_events:
+                crt_info = dict()
+                crt_info['time'] = mevent.start_ns
+                crt_info['size'] = mevent.bytes
+                if mevent.place in place_to_str:
+                    place = place_to_str[mevent.place]
+                else:
+                    place = "UnDefine"
+
+                if (mevent.place == profiler_pb2.MemEvent.CUDAPlace or
+                        mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace
+                    ) and mevent.device_id != gpuId:
+                    continue
+
+                crt_info['place'] = place
+                pid = mem_devices[(k, mevent.device_id, place)]
+                crt_info['pid'] = pid
+                crt_info['thread_id'] = mevent.thread_id
+                crt_info['device_id'] = mevent.device_id
+                mem_list.append(crt_info)
+                crt_info = dict()
+                crt_info['place'] = place
+                crt_info['pid'] = pid
+                crt_info['thread_id'] = mevent.thread_id
+                crt_info['device_id'] = mevent.device_id
+                crt_info['time'] = mevent.end_ns
+                crt_info['size'] = -mevent.bytes
+                mem_list.append(crt_info)
+                end_profiler = max(end_profiler, crt_info['time'])
+            mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
+            i = 0
+            total_size = 0
+            while i < len(mem_list):
+                total_size += mem_list[i]['size']
+                while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
+                        i + 1]['time']:
+                    total_size += mem_list[i + 1]['size']
+                    i += 1
+
+                chrome_trace.emit_counter(
+                    "Memory", "Memory", mem_list[i]['pid'],
+                    self._align_ts(mem_list[i]['time']), 0, total_size)
+                i += 1
+        return chrome_trace
+
+    def _getOPTraceInfoByGpuId(self, groupId, gpuId):
+        fileList = self.getFileListByGroup(groupId)
+        newFileList = []
+        for fileName in fileList:
+            rankId = self.getRankId(fileName)
+            localRank = rankId % self._gpuPerTrainer
+            if localRank == gpuId and (rankId // self._gpuPerTrainer
+                                       ) % self._groupSize < self._displaySize:
+                newFileList.append(fileName)
+
+        profile_dict = self._parseTask(newFileList)
+        initPid = PIPELINEINFO_TRACE_NUM + DCGMINFO_TRACE_NUM + NETINFO_TRACE_NUM
+        metaTrace, devicesPid, mem_devicesPid = self._allocate_pids(
+            profile_dict, gpuId, initPid)
+        eventsTrace = self._allocate_events(profile_dict, devicesPid, gpuId)
+        memEventsTrace = self._allocate_memory_event(profile_dict,
+                                                     mem_devicesPid, gpuId)
+
+        trace = {}
+        trace[
+            'traceEvents'] = metaTrace._metadata + eventsTrace._events + memEventsTrace._events
+        self.dumpOpInfoDict(trace, groupId, gpuId, True)
+
+        return trace
+
+    def getOPTraceInfo(self, groupId):
+        processPool = []
+        pidList = []
+
+        for gpuId in range(self._gpuPerTrainer):
+            subproc = Process(
+                target=self._getOPTraceInfoByGpuId, args=(
+                    groupId,
+                    gpuId, ))
+            processPool.append(subproc)
+            subproc.start()
+            pidList.append(subproc.pid)
+            self._logger.info(
+                "[op info]: process [%d] has been started for gpu [%d] ..."
+                % (subproc.pid, gpuId))
+
+        for t in processPool:
+            t.join()
+            pidList.remove(t.pid)
+            self._logger.info(
+                "[op info]: process [%d] has exited, %d processes remaining!" %
+                (t.pid, len(pidList)))
+
+        opInfo = {}
+
+        return opInfo
+
+    def parseFileByGroup(self, groupId, processNum=8):
+        # NOTE: parsing runs in the current process for now; processNum is
+        # kept for interface compatibility with the other readers.
+        fileList = self.getFileListByGroup(groupId)
+        return self._parseTask(fileList)
+
+
+def test_profileFileReader():
+    args = {
+        "dataPath": "data/newdata/profile",
+        "groupSize": 4,
+        "displaySize": 8,
+        "gpuPerTrainer": 8,
+        "minTimeStamp": 0,
+        "organizeForm": FILEORGANIZEFORM_BYRANK,
+    }
+
+    testReader = profileFileReader(getLogger(), args)
+    testReader.printArgs()
+    data = testReader.getOPTraceInfo(0)
+
+    jsObj = json.dumps(data)
+    with open('jsonFile.json', 'w') as fileObject:
+        fileObject.write(jsObj)
+
+
+if __name__ == "__main__":
+    test_profileFileReader()
diff --git a/tools/CrossStackProfiler/__init__.py b/tools/CrossStackProfiler/__init__.py
new file mode 100755
index 00000000000000..6f0ea85344b7e0
--- /dev/null
+++ b/tools/CrossStackProfiler/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py
index 78d9978c4bc45e..93337978393498 100644
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -112,7 +112,7 @@ def __urlretrieve(self, url, filename):
             print(e)
             print(
                 'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'.
-                format(url, ix, ix * 10, proxy))
+                format(url, ix, ix * 10, cur_proxy))
                 continue
             else:
                 return True
@@ -179,7 +179,7 @@ def __get_comment_by_prog(self, content, prog):
 
     def get_comment_of_file(self, f):
         #content = self.repo.get_contents(f.replace(PADDLE_ROOT, ''), 'pull/').decoded_content
         #todo: get file from github
-        with open(f) as fd:
+        with open(f, encoding="utf-8") as fd:
             lines = fd.readlines()
         lineno = 1
         inputs = ''
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 70d7fb98cb5387..fbc0b767eff44f 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -634,9 +634,6 @@
     'test_analyzer_bert',
     'test_analyzer_googlenet',
     'test_fleet_base',
-    'test_imperative_container_layerdict',
-    'test_set_value_op',
-    'test_view_op_reuse_allocation',
     'test_sequential',
     'test_sequential',
     'test_imperative_layers',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index 6de9d84379fea5..3fa9e9b782c1ae 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -27,11 +27,25 @@
 import hashlib
 import platform
 import functools
+import pkgutil
+import logging
+import paddle
 
 member_dict = collections.OrderedDict()
 
 visited_modules = set()
 
+logger = logging.getLogger()
+if logger.handlers:
+    # we assume the first handler is the one we want to configure
+    console = logger.handlers[0]
+else:
+    console = logging.StreamHandler(sys.stderr)
+    logger.addHandler(console)
+console.setFormatter(
+    logging.Formatter(
+        "%(asctime)s - %(funcName)s:%(lineno)d - %(levelname)s - %(message)s"))
+
 
 def md5(doc):
     try:
@@ -199,11 +213,124 @@ def visit_all_module(mod):
             visit_member(mod.__name__, instance)
 
 
+# all from gen_doc.py
+api_info_dict = {}  # used by get_all_api
+
+
+# step 1: walk through the paddle package to collect all the apis in api_set
+def get_all_api(root_path='paddle', attr="__all__"):
+    """
+    walk through the paddle package to collect all the apis.
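+
+    A rough sketch of the intended call (get_full_api_by_walk in
+    tools/sampcd_processor.py uses it this way and writes the names to the
+    API spec file):
+
+        apis = get_all_api()  # side effect: fills the global api_info_dict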
+ """ + global api_info_dict + api_counter = 0 + for filefinder, name, ispkg in pkgutil.walk_packages( + path=paddle.__path__, prefix=paddle.__name__ + '.'): + try: + if name in sys.modules: + m = sys.modules[name] + else: + # importlib.import_module(name) + m = eval(name) + continue + except AttributeError: + logger.warning("AttributeError occurred when `eval(%s)`", name) + pass + else: + api_counter += process_module(m, attr) + + api_counter += process_module(paddle, attr) + + logger.info('%s: collected %d apis, %d distinct apis.', attr, api_counter, + len(api_info_dict)) + + return [api_info['all_names'][0] for api_info in api_info_dict.values()] + + +def insert_api_into_dict(full_name, gen_doc_anno=None): + """ + insert add api into the api_info_dict + Return: + api_info object or None + """ + try: + obj = eval(full_name) + fc_id = id(obj) + except AttributeError: + logger.warning("AttributeError occurred when `id(eval(%s))`", full_name) + return None + except: + logger.warning("Exception occurred when `id(eval(%s))`", full_name) + return None + else: + logger.debug("adding %s to api_info_dict.", full_name) + if fc_id in api_info_dict: + api_info_dict[fc_id]["all_names"].add(full_name) + else: + api_info_dict[fc_id] = { + "all_names": set([full_name]), + "id": fc_id, + "object": obj, + "type": type(obj).__name__, + } + docstr = inspect.getdoc(obj) + if docstr: + api_info_dict[fc_id]["docstring"] = inspect.cleandoc(docstr) + if gen_doc_anno: + api_info_dict[fc_id]["gen_doc_anno"] = gen_doc_anno + return api_info_dict[fc_id] + + +# step 1 fill field : `id` & `all_names`, type, docstring +def process_module(m, attr="__all__"): + api_counter = 0 + if hasattr(m, attr): + # may have duplication of api + for api in set(getattr(m, attr)): + if api[0] == '_': continue + # Exception occurred when `id(eval(paddle.dataset.conll05.test, get_dict))` + if ',' in api: continue + + # api's fullname + full_name = m.__name__ + "." + api + api_info = insert_api_into_dict(full_name) + if api_info is not None: + api_counter += 1 + if inspect.isclass(api_info['object']): + for name, value in inspect.getmembers(api_info['object']): + if (not name.startswith("_")) and hasattr(value, + '__name__'): + method_full_name = full_name + '.' 
+                            method_api_info = insert_api_into_dict(
+                                method_full_name, 'class_method')
+                            if method_api_info is not None:
+                                api_counter += 1
+    return api_counter
+
+
+def get_all_api_from_modulelist():
+    modulelist = [
+        paddle, paddle.amp, paddle.nn, paddle.nn.functional,
+        paddle.nn.initializer, paddle.nn.utils, paddle.static, paddle.static.nn,
+        paddle.io, paddle.jit, paddle.metric, paddle.distribution,
+        paddle.optimizer, paddle.optimizer.lr, paddle.regularizer, paddle.text,
+        paddle.utils, paddle.utils.download, paddle.utils.profiler,
+        paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision,
+        paddle.distributed, paddle.distributed.fleet,
+        paddle.distributed.fleet.utils, paddle.distributed.parallel,
+        paddle.distributed.utils, paddle.callbacks, paddle.hub, paddle.autograd
+    ]
+    for m in modulelist:
+        visit_all_module(m)
+
+    return member_dict
+
+
 if __name__ == '__main__':
-    import paddle
-    modules = sys.argv[1].split(",")
-    for m in modules:
-        visit_all_module(importlib.import_module(m))
+    # modules = sys.argv[1].split(",")
+    # for m in modules:
+    #     visit_all_module(importlib.import_module(m))
+    get_all_api_from_modulelist()
 
     for name in member_dict:
         print(name, member_dict[name])
diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py
index a1658e3c2edf79..0ac6c929c5d758 100644
--- a/tools/sampcd_processor.py
+++ b/tools/sampcd_processor.py
@@ -39,14 +39,13 @@
     console = logger.handlers[
         0]  # we assume the first handler is the one we want to configure
 else:
-    console = logging.StreamHandler()
+    console = logging.StreamHandler(stream=sys.stderr)
     logger.addHandler(console)
 console.setFormatter(logging.Formatter("%(message)s"))
 
 RUN_ON_DEVICE = 'cpu'
 SAMPLE_CODE_TEST_CAPACITY = set()
 GPU_ID = 0
-methods = []
 whl_error = []
 API_DEV_SPEC_FN = 'paddle/fluid/API_DEV.spec'
 API_PR_SPEC_FN = 'paddle/fluid/API_PR.spec'
@@ -247,13 +246,15 @@ def is_required_match(requirestr, cbtitle='not-specified'):
         False - not match
         None - skipped  # trick
     """
-    global SAMPLE_CODE_TEST_CAPACITY  # readonly
+    global SAMPLE_CODE_TEST_CAPACITY, RUN_ON_DEVICE  # readonly
     requires = set(['cpu'])
    if requirestr:
         for r in requirestr.split(','):
             rr = r.strip().lower()
             if rr:
                 requires.add(rr)
+    else:
+        requires.add(RUN_ON_DEVICE)
     if 'skip' in requires or 'skiptest' in requires:
         logger.info('%s: skipped', cbtitle)
         return None
@@ -283,8 +284,8 @@ def insert_codes_into_codeblock(codeblock, apiname='not-specified'):
     cpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n'
     gpu_str = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format(
         GPU_ID)
-    if 'required' in codeblock:
-        if codeblock['required'] is None or codeblock['required'] == 'cpu':
+    if 'required' in codeblock and codeblock['required']:
+        if codeblock['required'] == 'cpu':
             inserted_codes_f = cpu_str
         elif codeblock['required'] == 'gpu':
             inserted_codes_f = gpu_str
@@ -426,20 +427,25 @@ def execute_samplecode(tfname):
     return result, tfname, msg, end_time - start_time
 
 
-def get_filenames():
+def get_filenames(full_test=False):
     '''
     this function will get the sample code files that pending for check.
 
+    Args:
+        full_test: check the full api list if True, otherwise only the increment
+
     Returns:
 
         dict: the sample code files pending for check .
     '''
-    global methods  # write
     global whl_error
     import paddle
     whl_error = []
-    get_incrementapi()
+    if full_test:
+        get_full_api()
+    else:
+        get_incrementapi()
     all_sample_code_filenames = {}
     with open(API_DIFF_SPEC_FN) as f:
         for line in f.readlines():
@@ -472,8 +478,9 @@ def get_api_md5(path):
         api_md5(dict): key is the api's real fullname, value is the md5sum.
     """
     api_md5 = {}
-    API_spec = '%s/%s' % (os.path.abspath(os.path.join(os.getcwd(), "..")),
-                          path)
+    API_spec = os.path.abspath(os.path.join(os.getcwd(), "..", path))
+    if not os.path.isfile(API_spec):
+        return api_md5
     pat = re.compile(r'\((paddle[^,]+)\W*document\W*([0-9a-z]{32})')
     patArgSpec = re.compile(
         r'^(paddle[^,]+)\s+\(ArgSpec.*document\W*([0-9a-z]{32})')
@@ -487,6 +494,28 @@
     return api_md5
 
 
+def get_full_api():
+    """
+    get all the apis
+    """
+    global API_DIFF_SPEC_FN  ## readonly
+    from print_signatures import get_all_api_from_modulelist
+    member_dict = get_all_api_from_modulelist()
+    with open(API_DIFF_SPEC_FN, 'w') as f:
+        f.write("\n".join(member_dict.keys()))
+
+
+def get_full_api_by_walk():
+    """
+    get all the apis
+    """
+    global API_DIFF_SPEC_FN  ## readonly
+    from print_signatures import get_all_api
+    apilist = get_all_api()
+    with open(API_DIFF_SPEC_FN, 'w') as f:
+        f.write("\n".join(apilist))
+
+
 def get_incrementapi():
     '''
     this function will get the apis that difference between API_DEV.spec and API_PR.spec.
@@ -526,6 +555,7 @@ def parse_args():
     #     help='Use CPU mode (overrides --gpu)')
     # parser.add_argument('--gpu', dest='gpu_mode', action="store_true")
     parser.add_argument('--debug', dest='debug', action="store_true")
+    parser.add_argument('--full-test', dest='full_test', action="store_true")
     parser.add_argument('mode', type=str, help='run on device', default='cpu')
     for item in arguments:
         parser.add_argument(
@@ -545,6 +575,8 @@
     args = parse_args()
     if args.debug:
         logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
     if args.logf:
         logfHandler = logging.FileHandler(args.logf)
         logfHandler.setFormatter(
@@ -573,7 +605,7 @@
     else:
         os.mkdir(SAMPLECODE_TEMPDIR)
 
-    filenames = get_filenames()
+    filenames = get_filenames(args.full_test)
     if len(filenames) == 0 and len(whl_error) == 0:
         logger.info("-----API_PR.spec is the same as API_DEV.spec-----")
         exit(0)
@@ -593,6 +625,8 @@
     if not args.debug:
         shutil.rmtree(SAMPLECODE_TEMPDIR)
 
+    stdout_handler = logging.StreamHandler(stream=sys.stdout)
+    logger.addHandler(stdout_handler)
     logger.info("----------------End of the Check--------------------")
     if len(whl_error) != 0:
         logger.info("%s is not in whl.", whl_error)
diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index bc0c5af4d72a29..d1e4680e63f95e 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -711,4 +711,5 @@
     'test_model_cast_to_bf16',
     'test_sgd_op_bf16',
     'test_marker_op',
+    'test_c_embedding_op',
 ]
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index 68d7ef336edba6..4dbacbaa59a5da 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -195,7 +195,7 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then
         num=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' | wc -l)
         echo "Windows 1 card TestCases count is $num"
         if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
-            python ${PADDLE_ROOT}/tools/get_pr_ut.py
+            python ${PADDLE_ROOT}/tools/get_pr_ut.py || echo "Failed to obtain ut_list !"
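+            # assuming this script runs with `set -e` semantics, the `|| echo`
+            # above keeps a get_pr_ut.py failure from aborting the whole run;
+            # the `ut_list` check below then falls back to the full test list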
if [[ -f "ut_list" ]]; then echo "PREC length: "`wc -l ut_list` precision_cases=`cat ut_list`