diff --git a/.github/ISSUE_TEMPLATE/---document-issue-.md b/.github/ISSUE_TEMPLATE/---document-issue-.md
index 7c464ac584bc87..ffc2fcd7817b64 100644
--- a/.github/ISSUE_TEMPLATE/---document-issue-.md
+++ b/.github/ISSUE_TEMPLATE/---document-issue-.md
@@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not
 For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc.
 
 #### Other
-For example: The doc link is broken; The doc page is missing; Dead link in docs.
\ No newline at end of file
+For example: The doc link is broken; The doc page is missing; Dead link in docs.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d2f613eff5c0c..59bc768aa41e1a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -33,6 +33,7 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
+option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
 if (WITH_GPU AND WITH_XPU)
   message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
 endif()
@@ -57,6 +58,9 @@ if(WITH_MUSL)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy")
 endif()
 
+if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+endif()
 
 if(WIN32)
   option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 9c1bd52e7fb7df..2a1e6897c02e44 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -93,13 +93,18 @@ if(WITH_GPU)
 
   FIND_PACKAGE(CUDA REQUIRED)
 
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7)
-    message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1)
+    message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile")
   endif()
 
   if(NOT CUDNN_FOUND)
     message(FATAL_ERROR "Paddle needs cudnn to compile")
   endif()
+
+  if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+    message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile")
+  endif()
+
   if(CUPTI_FOUND)
     include_directories(${CUPTI_INCLUDE_DIR})
     add_definitions(-DPADDLE_WITH_CUPTI)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 05b55952074429..7f2addb02d36dd 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -6,15 +6,9 @@ endif()
 if (WITH_NV_JETSON)
   add_definitions(-DWITH_NV_JETSON)
   set(paddle_known_gpu_archs "53 62 72")
-  set(paddle_known_gpu_archs7 "53")
-  set(paddle_known_gpu_archs8 "53 62")
-  set(paddle_known_gpu_archs9 "53 62")
   set(paddle_known_gpu_archs10 "53 62 72")
 else()
-  set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
-  set(paddle_known_gpu_archs7 "30 35 50 52")
-  set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
-  set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70")
+  set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80")
   set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
   set(paddle_known_gpu_archs11 "52 60 61 70 75 80")
 endif()
@@ -160,25 +154,7 @@ function(select_nvcc_arch_flags out_variable)
 endfunction()
 
 message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION})
-if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0)
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
-  # warning for now.
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x
-  set(paddle_known_gpu_archs ${paddle_known_gpu_archs9})
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets")
-elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
+if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x
   set(paddle_known_gpu_archs ${paddle_known_gpu_archs10})
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index bcf0c0a0646fc3..a0b6f480f95ae7 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -12,50 +12,47 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-INCLUDE(ExternalProject)
-
-SET(ASCEND_PROJECT "extern_ascend")
-IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE)
-  SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE)
-  SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE)
-ENDIF()
-MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}")
-SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend")
-SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}")
-SET(ASCEND_DST_DIR "ascend")
-SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
-SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR})
-SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR})
-SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include)
-SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib)
-SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so)
-SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so)
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${ASCEND_INC_DIR})
-FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(ASCEND)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n"
-  "        DESTINATION ${ASCEND_DST_DIR})\n")
-ExternalProject_Add(
-  ${ASCEND_PROJECT}
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  PREFIX ${ASCEND_SOURCE_DIR}
-  DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR}
-  DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz
-    && tar zxvf ${ASCEND_NAME}.tar.gz
-  DOWNLOAD_NO_PROGRESS 1
-  UPDATE_COMMAND ""
-  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT}
-  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT}
-)
-ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB})
+
+#NOTE: Logic is from
+# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt
+if(DEFINED ENV{ASCEND_CUSTOM_PATH})
+  set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH})
+else()
+  set(ASCEND_DIR /usr/local/Ascend)
+endif()
+
+set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64)
+set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common)
+set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share)
+set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64)
+set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64)
+set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64)
+set(STATIC_ACL_LIB ${ASCEND_ACL_DIR})
+
+set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR})
+set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR})
+set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64)
+set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include)
+set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64)
+set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64)
+set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR})
+
+set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so)
+set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so)
+set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so)
+INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR})
+
+if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h)
+  add_definitions(-DPADDLE_WITH_ASCEND_STRING)
+endif()
+
+ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib})
 ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB})
-ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT})
+SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib})
+
+ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib})
+add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl)
diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake
index 0eb590c42d0cb7..582c06e88c1d45 100644
--- a/cmake/external/brpc.cmake
+++ b/cmake/external/brpc.cmake
@@ -41,7 +41,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     # TODO(gongwb): change to de newst repo when they changed.
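Review note on the `ascend_string.h` probe in ascend.cmake above: when the header exists, the build defines `PADDLE_WITH_ASCEND_STRING`. A minimal sketch of how such a configure-time macro is typically consumed; the `GEString` alias and `MakeNodeName` helper are illustrative assumptions, not code from this patch:

```cpp
#include <string>

// Hypothetical consumer of PADDLE_WITH_ASCEND_STRING (defined by ascend.cmake
// when graph/ascend_string.h is present). Newer CANN toolkits expose
// ge::AscendString where older GE interfaces took std::string; one alias lets
// the same source build against either toolkit.
#ifdef PADDLE_WITH_ASCEND_STRING
#include "graph/ascend_string.h"
using GEString = ge::AscendString;  // newer toolkits
#else
using GEString = std::string;       // older toolkits without ascend_string.h
#endif

GEString MakeNodeName(const std::string& prefix, int idx) {
  return GEString((prefix + "_" + std::to_string(idx)).c_str());
}
```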
GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47" + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f68db1eab3d877..4619f9f7b7e34c 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -27,6 +27,14 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) +elseif(LINUX) + if(WITH_ROCM) + # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC + # which will cause compiler error of using __host__ funciont in __host__ __device__ + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) + endif() endif() set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) @@ -40,7 +48,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND "" + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a690..2e4a67093dc541 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) +if(WITH_ASCEND) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
&& make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3077..1466664c1266a7 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,13 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +239,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93ca1..0eabdb4e127bdf 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index ac28f7561f60c5..a4367510ac703f 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -21,6 +21,8 @@ ENDIF() SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) @@ -41,39 +43,77 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_ROCM=${WITH_ROCM} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS 
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=$ + -DCMAKE_C_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS=$ + -DCMAKE_CXX_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index c18332d3b87316..dcff02a662e273 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. 
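The duplicated `WITH_ASCEND` branches for gloo and warpctc above, and the protobuf v3.8.0 switch, mainly exist so the current `CMAKE_CXX_FLAGS` (including the `-D_GLIBCXX_USE_CXX11_ABI=0` set in the top-level CMakeLists when `WITH_ASCEND_CXX11` is off) propagates into the third-party builds. A minimal sketch, not Paddle code, of the link failure this avoids, assuming the prebuilt Ascend libraries use the pre-C++11 ABI:

```cpp
#include <string>

// libstdc++'s dual ABI mangles std::string differently:
//   _GLIBCXX_USE_CXX11_ABI=0 -> ...Ss...                 (old basic_string)
//   _GLIBCXX_USE_CXX11_ABI=1 -> ...NSt7__cxx1112basic_string...
// Suppose this function is exported by a .so built with ABI=0:
void LogToDevice(const std::string& msg);  // hypothetical library symbol

int main() {
  // If this translation unit is compiled with ABI=1, the reference it emits
  // uses the __cxx11 mangling, which the old-ABI library does not define, so
  // the link fails with "undefined reference". Building Paddle and all of its
  // third-party dependencies with one ABI setting avoids the mismatch.
  LogToDevice("hello");
}
```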
add_subdirectory(inference) -add_subdirectory(train) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index b638af49730dd4..9aafdd769ed4a0 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,41 +146,6 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } -std::future FleetWrapper::PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, std::vector* fea_keys, - std::vector>* fea_values, int fea_value_dim) { - fea_keys->clear(); - fea_keys->resize(0); - fea_keys->reserve(MAX_FEASIGN_NUM); - for (auto name : var_names) { - Variable* var = scope.FindVar(name); - if (var == nullptr) { - continue; - } - LoDTensor* tensor = var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; - int64_t* ids = tensor->data(); - size_t len = tensor->numel(); - for (auto i = 0u; i < len; ++i) { - if (ids[i] == 0u) { - continue; - } - fea_keys->push_back(static_cast(ids[i])); - } - } - fea_values->resize(fea_keys->size() + 1); - for (auto& t : *fea_values) { - t.resize(fea_value_dim); - } - std::vector pull_result_ptr; - for (auto& t : *fea_values) { - pull_result_ptr.push_back(t.data()); - } - return pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); -} - void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, @@ -224,8 +189,10 @@ void FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } + bool training = true; auto status = pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(), + training); pull_sparse_status.push_back(std::move(status)); for (auto& t : pull_sparse_status) { t.wait(); @@ -238,9 +205,13 @@ void FleetWrapper::PullSparseVarsSync( } } +// is_training is true means training, false means inference, the behavior is +// different on pserver + void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, std::vector* outputs) { std::vector fea_keys; @@ -279,7 +250,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, } auto* communicator = Communicator::GetInstance(); auto status = communicator->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size()); + pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(), + is_training); status.wait(); auto ret = status.get(); if (ret != 0) { diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index ac566606ddcb40..863440180a808d 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,19 +84,14 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); - // Pull sparse variables from server in async mode - // Param: scope, table_id, var_names, fea_keys, fea_dim - // Param: fea_values std::future - std::future PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, - std::vector* fea_keys, - std::vector>* fea_values, int fea_dim); - // Pull sparse variables from server in sync mode // pull immediately to tensors + // is_training is true means training, false means 
inference, the behavior is + // different on pserver + void PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, // NOLINT std::vector* outputs); // NOLINT diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index bb3f6f1174da9d..843dea9eea6ef9 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -24,11 +24,12 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - +set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) -cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) +cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) @@ -38,3 +39,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 163526fe3b28c9..b49a71ab0c13ad 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -768,8 +768,8 @@ std::future BrpcPsClient::push_global_step(int table_id, std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) { + const uint64_t *keys, size_t num, + bool is_training) { size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -837,16 +837,27 @@ std::future BrpcPsClient::pull_sparse(float **select_values, uint32_t kv_request_count = 0; size_t sorted_kv_size = sorted_kvs.size(); auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append((void *)&is_training, sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { ++kv_request_count; + uint32_t keys = 1; last_key = sorted_kvs[kv_idx].first; request_buffer.append((void *)&last_key, sizeof(uint64_t)); while (kv_idx < sorted_kv_size - 1 && last_key == sorted_kvs[kv_idx + 1].first) { ++kv_idx; + ++keys; } + 
keys_counter.push_back(keys); } + request_buffer.append((void *)keys_counter.data(), + sizeof(uint32_t) * keys_counter.size()); + if (kv_request_count == 0) { closure->Run(); } else { @@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } auto status = pull_sparse((float **)save_vec.data(), table_id, - save_key.data(), save_key.size()); + save_key.data(), save_key.size(), true); status.wait(); // create lod tensor @@ -990,4 +1001,4 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } } // namespace distributed -} // namespace paddle +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/service/brpc_ps_client.h index 8f9d2653864d1c..5192356e4b5e57 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient { virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, size_t num); + const uint64_t *keys, size_t num, + bool is_training); virtual std::future print_table_stat(uint32_t table_id); @@ -170,9 +171,22 @@ class BrpcPsClient : public PSClient { virtual int32_t recv_and_save_table(const uint64_t table_id, const std::string &path); - private: + protected: + virtual size_t get_server_nums() { return _server_channels.size(); } + inline brpc::Channel *get_sparse_channel(size_t server_id) { + return _server_channels[server_id][0].get(); + } + inline brpc::Channel *get_dense_channel(size_t server_id) { + return _server_channels[server_id][1].get(); + } + inline brpc::Channel *get_cmd_channel(size_t server_id) { + return _server_channels[server_id][2].get(); + } virtual int32_t initialize() override; + private: + // virtual int32_t initialize() override; + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, uint32_t shard_num) { return dense_dim_total / shard_num + 1; @@ -184,16 +198,6 @@ class BrpcPsClient : public PSClient { std::future send_save_cmd(uint32_t table_id, int cmd_id, const std::vector ¶m); - inline brpc::Channel *get_sparse_channel(size_t server_id) { - return _server_channels[server_id][0].get(); - } - inline brpc::Channel *get_dense_channel(size_t server_id) { - return _server_channels[server_id][1].get(); - } - inline brpc::Channel *get_cmd_channel(size_t server_id) { - return _server_channels[server_id][2].get(); - } - bool _running = false; bool _flushing = false; std::atomic _async_call_num; //异步请求计数 @@ -220,8 +224,6 @@ class BrpcPsClient : public PSClient { size_t num, void *done) override; - virtual size_t get_server_nums() { return _server_channels.size(); } - private: int32_t start_client_service(); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index 8400e669182d67..a9370561a540be 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" @@ -60,7 +61,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); std::string ip_port = ip + ":" + std::to_string(port); - VLOG(3) << "server of rank " << 
_rank << " starts at " << ip_port; + VLOG(0) << "running server with rank id: " << _rank + << ", endpoint: " << ip_port; brpc::ServerOptions options; int num_threads = std::thread::hardware_concurrency(); @@ -336,33 +338,39 @@ int32_t BrpcPsService::pull_sparse(Table *table, brpc::Controller *cntl) { platform::RecordEvent record_event("PsService->pull_sparse"); CHECK_TABLE_EXIST(table, request, response) - thread_local std::string push_sparse_request_buffer; + auto &req_io_buffer = cntl->request_attachment(); auto req_buffer_size = req_io_buffer.size(); + if (req_buffer_size < 1) { set_response_code(response, -1, "req attachment is empty"); return 0; } + if (request.params_size() < 1) { set_response_code(response, -1, "PsRequestMessage.params is requeired at " "least 1 for num of sparse_key"); return 0; } + uint32_t num = *(uint32_t *)(request.params(0).c_str()); - push_sparse_request_buffer.resize(0); - push_sparse_request_buffer.reserve(req_buffer_size); - const char *data = (const char *)cntl->request_attachment().fetch( - const_cast(push_sparse_request_buffer.data()), req_buffer_size); - /* - Attachment Content: - |---keysData---| - |---8*{num}B---| - */ - const uint64_t *keys = (const uint64_t *)data; + auto dim = table->value_accesor()->select_dim(); + + thread_local std::string req_buffer; + req_buffer.reserve(req_buffer_size); + + const void *data = cntl->request_attachment().fetch( + const_cast(req_buffer.data()), req_buffer_size); + + auto value = PullSparseValue(num, dim); + + value.DeserializeFromBytes(const_cast(data)); + std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_sparse(res_data.data(), keys, num); + res_data.resize(num * dim); + table->pull_sparse(res_data.data(), value); + cntl->response_attachment().append((char *)res_data.data(), res_data.size() * sizeof(float)); return 0; @@ -538,7 +546,7 @@ int32_t BrpcPsService::stop_server(Table *table, auto *p_server = _server; std::thread t_stop([p_server]() { p_server->stop(); - LOG(INFO) << "Server Stoped"; + VLOG(3) << "Server Stoped"; }); t_stop.detach(); return 0; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 096718768149c5..a356b77e73733e 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { while (hp->h_addr_list[i] != NULL) { int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; + VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; break; } diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 8699719e5cdcc8..3d5ab8e16d9020 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, push_g_vec.push_back(tensor->data() + i * dim); } + bool training = true; + auto status = _worker_ptr->pull_sparse( (float **)push_g_vec.data(), table_id, // NOLINT - sparse_push_keys.data(), sparse_push_keys.size()); + sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); return; } diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h index 901aba0ad90c49..ca395a776afd4e 100644 
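Reviewer sketch of the new `pull_sparse` request attachment, as implied by the client-side `append()` calls above; the struct and field names are ours, and the server decodes the bytes via `PullSparseValue::DeserializeFromBytes` (declared in `sparse_utils.h`, which this diff only includes):

```cpp
#include <cstddef>
#include <cstdint>

// Layout of the request attachment built in BrpcPsClient::pull_sparse above:
//
//   | is_training | unique keys         | keys_counter        |
//   | 1 x bool    | n_unique x uint64_t | n_unique x uint32_t |
//
// The client sorts each shard's batch and collapses duplicates;
// keys_counter[i] records how many positions in the original batch requested
// unique_keys[i], so the server can expand its reply without the duplicate
// keys being resent.
struct PullSparseRequestView {        // illustrative name, not in the patch
  bool is_training;                   // false = inference; pserver behavior differs
  const std::uint64_t* unique_keys;   // n_unique deduplicated, sorted keys
  const std::uint32_t* keys_counter;  // per-key multiplicities
  std::size_t n_unique;
};
```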
--- a/paddle/fluid/distributed/service/env.h +++ b/paddle/fluid/distributed/service/env.h @@ -39,7 +39,7 @@ struct PSHost { // |---ip---|---port---|--rank--| // |-32bit--|--20bit---|--12bit-| - // for pslib + uint64_t serialize_to_uint64() { uint64_t host_label = 0; host_label = inet_addr(ip.c_str()); @@ -175,14 +175,12 @@ class PSEnvironment { host.ip = ip; host.port = port; host.rank = rank; - if (sign_set.count(rank) > 0) { - LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port - << ", rank:" << host.rank - << " already register, ignore register"; - } else { + + if (sign_set.count(rank) == 0) { host_list.push_back(host); sign_set.insert(rank); } + return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc new file mode 100644 index 00000000000000..a6271cac83c9a9 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -0,0 +1,331 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include +#include +#include +#include +#include +#include +#include "Eigen/Dense" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +void GraphPsService_Stub::service( + ::google::protobuf::RpcController *controller, + const ::paddle::distributed::PsRequestMessage *request, + ::paddle::distributed::PsResponseMessage *response, + ::google::protobuf::Closure *done) { + if (graph_service != NULL && local_channel == channel()) { + // VLOG(0)<<"use local"; + task_pool->enqueue([this, controller, request, response, done]() -> int { + this->graph_service->service(controller, request, response, done); + return 0; + }); + } else { + // VLOG(0)<<"use server"; + PsService_Stub::service(controller, request, response, done); + } +} + +int GraphBrpcClient::get_server_index_by_id(uint64_t id) { + int shard_num = get_shard_num(); + int shard_per_server = shard_num % server_size == 0 + ? 
shard_num / server_size + : shard_num / server_size + 1; + return id % shard_num / shard_per_server; +} + +std::future GraphBrpcClient::get_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); + ++feat_idx) { + for (size_t node_idx = 0; + node_idx < query_idx_buckets.at(request_idx).size(); + ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feature = std::string(buffer, feat_len); + res[feat_idx][query_idx] = feature; + buffer += feat_len; + } + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + PsService_Stub rpc_stub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +// char* &buffer,int &actual_size +std::future 
GraphBrpcClient::batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + res.clear(); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + res.push_back(std::vector>()); + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = + buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[query_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + 
return fut; +} +std::future GraphBrpcClient::random_sample_nodes( + uint32_t table_id, int server_index, int sample_size, + std::vector &ids) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + ids.push_back(*(uint64_t *)(buffer + index)); + index += GraphNode::id_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +std::future GraphBrpcClient::pull_graph_list( + uint32_t table_id, int server_index, int start, int size, int step, + std::vector &res) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + FeatureNode node; + node.recover_from_buffer(buffer + index); + index += node.get_size(false); + res.push_back(node); + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&start, sizeof(int)); + closure->request(0)->add_params((char *)&size, sizeof(int)); + closure->request(0)->add_params((char *)&step, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +int32_t GraphBrpcClient::initialize() { + // set_shard_num(_config.shard_num()); + BrpcPsClient::initialize(); + server_size = get_server_nums(); + graph_service = NULL; + local_channel = NULL; + return 0; +} +} +} diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h new file mode 100644 index 00000000000000..4e6775a4bedaf1 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.h 
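`get_server_index_by_id` above assigns shards to servers in contiguous blocks of `ceil(shard_num / server_size)`. A standalone restatement with a worked example (values hypothetical):

```cpp
#include <cstdint>

// Same mapping as GraphBrpcClient::get_server_index_by_id, with the ceiling
// division written out. Example: shard_num = 10, server_size = 3 gives
// shard_per_server = 4, so node id 25 -> shard 25 % 10 = 5 -> server 5 / 4 = 1.
int ServerIndexById(std::uint64_t id, int shard_num, int server_size) {
  int shard_per_server = (shard_num + server_size - 1) / server_size;  // ceil
  return static_cast<int>(id % shard_num) / shard_per_server;
}
```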
@@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include "ThreadPool.h" +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace distributed { + +class GraphPsService_Stub : public PsService_Stub { + public: + GraphPsService_Stub(::google::protobuf::RpcChannel* channel, + ::google::protobuf::RpcChannel* local_channel = NULL, + GraphBrpcService* service = NULL, int thread_num = 1) + : PsService_Stub(channel) { + this->local_channel = local_channel; + this->graph_service = service; + task_pool.reset(new ::ThreadPool(thread_num)); + } + virtual ~GraphPsService_Stub() {} + + // implements PsService ------------------------------------------ + GraphBrpcService* graph_service; + std::shared_ptr<::ThreadPool> task_pool; + ::google::protobuf::RpcChannel* local_channel; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub); + void service(::google::protobuf::RpcController* controller, + const ::paddle::distributed::PsRequestMessage* request, + ::paddle::distributed::PsResponseMessage* response, + ::google::protobuf::Closure* done); +}; +class GraphBrpcClient : public BrpcPsClient { + public: + GraphBrpcClient() {} + virtual ~GraphBrpcClient() {} + // given a batch of nodes, sample graph_neighboors for each of them + virtual std::future batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>>& res); + + virtual std::future pull_graph_list(uint32_t table_id, + int server_index, int start, + int size, int step, + std::vector& res); + virtual std::future random_sample_nodes(uint32_t table_id, + int server_index, + int sample_size, + std::vector& ids); + virtual std::future get_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + std::vector>& res); + virtual int32_t initialize(); + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + int get_server_index_by_id(uint64_t id); + void set_local_channel(int index) { + this->local_channel = get_cmd_channel(index); + } + void set_local_graph_service(GraphBrpcService* graph_service) { + this->graph_service = graph_service; + } + GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel, + int thread_num = 1) { + return GraphPsService_Stub(channel, local_channel, graph_service, + thread_num); + } + + private: + int shard_num; + size_t server_size; + ::google::protobuf::RpcChannel* local_channel; + 
GraphBrpcService* graph_service; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc new file mode 100644 index 00000000000000..bdd926278b624b --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" + +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { + +int32_t GraphBrpcServer::initialize() { + auto &service_config = _config.downpour_server_param().service_param(); + if (!service_config.has_service_class()) { + LOG(ERROR) << "miss service_class in ServerServiceParameter"; + return -1; + } + auto *service = + CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class()); + if (service == NULL) { + LOG(ERROR) << "service is unregistered, service_name:" + << service_config.service_class(); + return -1; + } + + _service.reset(service); + if (service->configure(this) != 0 || service->initialize() != 0) { + LOG(ERROR) << "service initialize failed, service_name:" + << service_config.service_class(); + return -1; + } + if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "service add to brpc failed, service:" + << service_config.service_class(); + return -1; + } + return 0; +} + +uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { + std::unique_lock lock(mutex_); + + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + brpc::ServerOptions options; + + int num_threads = std::thread::hardware_concurrency(); + auto trainers = _environment->get_trainers(); + options.num_threads = trainers > num_threads ? 
trainers : num_threads;
+
+  if (_server.Start(ip_port.c_str(), &options) != 0) {
+    LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port;
+    return 0;
+  }
+  _environment->registe_ps_server(ip, port, _rank);
+  return 0;
+}
+
+int32_t GraphBrpcServer::port() { return _server.listen_address().port; }
+
+int32_t GraphBrpcService::initialize() {
+  _is_initialize_shard_info = false;
+  _service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server;
+  _service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table;
+  _service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table;
+
+  _service_handler_map[PS_PRINT_TABLE_STAT] =
+      &GraphBrpcService::print_table_stat;
+  _service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier;
+  _service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler;
+  _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler;
+
+  _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list;
+  _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] =
+      &GraphBrpcService::graph_random_sample_neighboors;
+  _service_handler_map[PS_GRAPH_SAMPLE_NODES] =
+      &GraphBrpcService::graph_random_sample_nodes;
+  _service_handler_map[PS_GRAPH_GET_NODE_FEAT] =
+      &GraphBrpcService::graph_get_node_feat;
+
+  // Shard initialization: the shard info of server_list can only be read from
+  // env after the server has started.
+  initialize_shard_info();
+
+  return 0;
+}
+
+#define CHECK_TABLE_EXIST(table, request, response)        \
+  if (table == NULL) {                                     \
+    std::string err_msg("table not found with table_id:"); \
+    err_msg.append(std::to_string(request.table_id()));    \
+    set_response_code(response, -1, err_msg.c_str());      \
+    return -1;                                             \
+  }
+
+int32_t GraphBrpcService::initialize_shard_info() {
+  if (!_is_initialize_shard_info) {
+    std::lock_guard<std::mutex> guard(_initialize_shard_mutex);
+    if (_is_initialize_shard_info) {
+      return 0;
+    }
+    size_t shard_num = _server->environment()->get_ps_servers().size();
+    auto &table_map = *(_server->table());
+    for (auto itr : table_map) {
+      itr.second->set_shard(_rank, shard_num);
+    }
+    _is_initialize_shard_info = true;
+  }
+  return 0;
+}
+
+void GraphBrpcService::service(google::protobuf::RpcController *cntl_base,
+                               const PsRequestMessage *request,
+                               PsResponseMessage *response,
+                               google::protobuf::Closure *done) {
+  brpc::ClosureGuard done_guard(done);
+  std::string log_label("ReceiveCmd-");
+  if (!request->has_table_id()) {
+    set_response_code(*response, -1, "PsRequestMessage.table_id is required");
+    return;
+  }
+
+  response->set_err_code(0);
+  response->set_err_msg("");
+  auto *table = _server->table(request->table_id());
+  brpc::Controller *cntl = static_cast<brpc::Controller *>(cntl_base);
+  auto itr = _service_handler_map.find(request->cmd_id());
+  if (itr == _service_handler_map.end()) {
+    std::string err_msg(
+        "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:");
+    err_msg.append(std::to_string(request->cmd_id()));
+    set_response_code(*response, -1, err_msg.c_str());
+    return;
+  }
+  serviceFunc handler_func = itr->second;
+  int service_ret = (this->*handler_func)(table, *request, *response, cntl);
+  if (service_ret != 0) {
+    response->set_err_code(service_ret);
+    response->set_err_msg("server internal error");
+  }
+}
+
+int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request,
+                                  PsResponseMessage &response,
+                                  brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+
+  if (request.params_size() < 1) {
+    set_response_code(response, -1,
+                      "PsRequestMessage.params is required, at "
+                      "least 1 for barrier_type");
+    return 0;
+  }
+
+  auto trainer_id = request.client_id();
+  auto barrier_type = request.params(0);
+  table->barrier(trainer_id, barrier_type);
+  return 0;
+}
+
+int32_t GraphBrpcService::print_table_stat(Table *table,
+                                           const PsRequestMessage &request,
+                                           PsResponseMessage &response,
+                                           brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+  std::pair<int64_t, int64_t> ret = table->print_table_stat();
+  paddle::framework::BinaryArchive ar;
+  ar << ret.first << ret.second;
+  std::string table_info(ar.Buffer(), ar.Length());
+  response.set_data(table_info);
+
+  return 0;
+}
+
+int32_t GraphBrpcService::load_one_table(Table *table,
+                                         const PsRequestMessage &request,
+                                         PsResponseMessage &response,
+                                         brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+  if (request.params_size() < 2) {
+    set_response_code(
+        response, -1,
+        "PsRequestMessage.params is required, at least 2 for path & load_param");
+    return -1;
+  }
+  if (table->load(request.params(0), request.params(1)) != 0) {
+    set_response_code(response, -1, "table load failed");
+    return -1;
+  }
+  return 0;
+}
+
+int32_t GraphBrpcService::load_all_table(Table *table,
+                                         const PsRequestMessage &request,
+                                         PsResponseMessage &response,
+                                         brpc::Controller *cntl) {
+  auto &table_map = *(_server->table());
+  for (auto &itr : table_map) {
+    if (load_one_table(itr.second.get(), request, response, cntl) != 0) {
+      LOG(ERROR) << "load table[" << itr.first << "] failed";
+      return -1;
+    }
+  }
+  return 0;
+}
+
+int32_t GraphBrpcService::stop_server(Table *table,
+                                      const PsRequestMessage &request,
+                                      PsResponseMessage &response,
+                                      brpc::Controller *cntl) {
+  GraphBrpcServer *p_server = (GraphBrpcServer *)_server;
+  std::thread t_stop([p_server]() {
+    p_server->stop();
+    LOG(INFO) << "Server Stopped";
+  });
+  p_server->export_cv()->notify_all();
+  t_stop.detach();
+  return 0;
+}
+
+int32_t GraphBrpcService::stop_profiler(Table *table,
+                                        const PsRequestMessage &request,
+                                        PsResponseMessage &response,
+                                        brpc::Controller *cntl) {
+  platform::DisableProfiler(platform::EventSortingKey::kDefault,
+                            string::Sprintf("server_%s_profile", _rank));
+  return 0;
+}
+
+int32_t GraphBrpcService::start_profiler(Table *table,
+                                         const PsRequestMessage &request,
+                                         PsResponseMessage &response,
+                                         brpc::Controller *cntl) {
+  platform::EnableProfiler(platform::ProfilerState::kCPU);
+  return 0;
+}
+
+int32_t GraphBrpcService::pull_graph_list(Table *table,
+                                          const PsRequestMessage &request,
+                                          PsResponseMessage &response,
+                                          brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+  if (request.params_size() < 3) {
+    set_response_code(response, -1,
+                      "pull_graph_list request requires at least 3 arguments");
+    return 0;
+  }
+  int start = *(int *)(request.params(0).c_str());
+  int size = *(int *)(request.params(1).c_str());
+  int step = *(int *)(request.params(2).c_str());
+  std::unique_ptr<char[]> buffer;
+  int actual_size;
+  ((GraphTable *)table)
+      ->pull_graph_list(start, size, buffer, actual_size, false, step);
+  cntl->response_attachment().append(buffer.get(), actual_size);
+  return 0;
+}
+int32_t GraphBrpcService::graph_random_sample_neighboors(
+    Table *table, const PsRequestMessage &request, PsResponseMessage &response,
+    brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+  if (request.params_size() < 2) {
+    set_response_code(
+        response, -1,
+        "graph_random_sample request requires at least 2 arguments");
+    return 0;
+  }
+  size_t node_num = request.params(0).size() / sizeof(uint64_t);
+  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
+  int sample_size = *(uint64_t *)(request.params(1).c_str());
+  std::vector<std::unique_ptr<char[]>> buffers(node_num);
+  std::vector<int> actual_sizes(node_num, 0);
+  ((GraphTable *)table)
+      ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes);
+
+  cntl->response_attachment().append(&node_num, sizeof(size_t));
+  cntl->response_attachment().append(actual_sizes.data(),
+                                     sizeof(int) * node_num);
+  for (size_t idx = 0; idx < node_num; ++idx) {
+    cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]);
+  }
+  return 0;
+}
+int32_t GraphBrpcService::graph_random_sample_nodes(
+    Table *table, const PsRequestMessage &request, PsResponseMessage &response,
+    brpc::Controller *cntl) {
+  size_t size = *(uint64_t *)(request.params(0).c_str());
+  std::unique_ptr<char[]> buffer;
+  int actual_size;
+  if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) ==
+      0) {
+    cntl->response_attachment().append(buffer.get(), actual_size);
+  } else {
+    cntl->response_attachment().append(NULL, 0);
+  }
+
+  return 0;
+}
+
+int32_t GraphBrpcService::graph_get_node_feat(Table *table,
+                                              const PsRequestMessage &request,
+                                              PsResponseMessage &response,
+                                              brpc::Controller *cntl) {
+  CHECK_TABLE_EXIST(table, request, response)
+  if (request.params_size() < 2) {
+    set_response_code(
+        response, -1,
+        "graph_get_node_feat request requires at least 2 arguments");
+    return 0;
+  }
+  size_t node_num = request.params(0).size() / sizeof(uint64_t);
+  uint64_t *node_data = (uint64_t *)(request.params(0).c_str());
+  std::vector<uint64_t> node_ids(node_data, node_data + node_num);
+
+  std::vector<std::string> feature_names =
+      paddle::string::split_string<std::string>(request.params(1), "\t");
+
+  std::vector<std::vector<std::string>> feature(
+      feature_names.size(), std::vector<std::string>(node_num));
+
+  ((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature);
+
+  for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) {
+    for (size_t node_idx = 0; node_idx < node_num; ++node_idx) {
+      size_t feat_len = feature[feat_idx][node_idx].size();
+      cntl->response_attachment().append(&feat_len, sizeof(size_t));
+      cntl->response_attachment().append(feature[feat_idx][node_idx].data(),
+                                         feat_len);
+    }
+  }
+
+  return 0;
+}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h
new file mode 100644
index 00000000000000..32c572f9e6c2bf
--- /dev/null
+++ b/paddle/fluid/distributed/service/graph_brpc_server.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
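+
+// GraphBrpcServer/GraphBrpcService mirror BrpcPsServer/BrpcPsService, but
+// dispatch the graph commands (PS_PULL_GRAPH_LIST, PS_GRAPH_SAMPLE_NEIGHBOORS,
+// PS_GRAPH_SAMPLE_NODES, PS_GRAPH_GET_NODE_FEAT) to a GraphTable. A minimal
+// server-side sketch, assuming the factory registrations added to server.cc
+// in this patch (error handling omitted, variable names illustrative):
+//
+//   PSServer *server = PSServerFactory::create(ps_config);  // GraphBrpcServer
+//   server->configure(ps_config, env, rank, server_sub_program);
+//   server->start(ip, port);  // serves until stop() / the PS_STOP_SERVER RPC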
+
+#pragma once
+
+#include "brpc/channel.h"
+#include "brpc/controller.h"
+#include "brpc/server.h"
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/distributed/service/brpc_ps_server.h"
+#include "paddle/fluid/distributed/service/server.h"
+#include "paddle/fluid/distributed/table/common_graph_table.h"
+#include "paddle/fluid/distributed/table/table.h"
+namespace paddle {
+namespace distributed {
+class GraphBrpcServer : public PSServer {
+ public:
+  GraphBrpcServer() {}
+  virtual ~GraphBrpcServer() {}
+  PsBaseService *get_service() { return _service.get(); }
+  virtual uint64_t start(const std::string &ip, uint32_t port);
+  virtual int32_t stop() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (stopped_) return 0;
+    stopped_ = true;
+    // cv_.notify_all();
+    _server.Stop(1000);
+    _server.Join();
+    return 0;
+  }
+  virtual int32_t port();
+
+  std::condition_variable *export_cv() { return &cv_; }
+
+ private:
+  virtual int32_t initialize();
+  mutable std::mutex mutex_;
+  std::condition_variable cv_;
+  bool stopped_ = false;
+  brpc::Server _server;
+  std::shared_ptr<PsBaseService> _service;
+  std::vector<std::shared_ptr<brpc::Channel>> _pserver_channels;
+};
+
+class GraphBrpcService;
+
+typedef int32_t (GraphBrpcService::*serviceFunc)(
+    Table *table, const PsRequestMessage &request, PsResponseMessage &response,
+    brpc::Controller *cntl);
+
+class GraphBrpcService : public PsBaseService {
+ public:
+  virtual int32_t initialize() override;
+
+  virtual void service(::google::protobuf::RpcController *controller,
+                       const PsRequestMessage *request,
+                       PsResponseMessage *response,
+                       ::google::protobuf::Closure *done) override;
+
+ protected:
+  std::unordered_map<int32_t, serviceFunc> _service_handler_map;
+  int32_t initialize_shard_info();
+  int32_t pull_graph_list(Table *table, const PsRequestMessage &request,
+                          PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t graph_random_sample_neighboors(Table *table,
+                                         const PsRequestMessage &request,
+                                         PsResponseMessage &response,
+                                         brpc::Controller *cntl);
+  int32_t graph_random_sample_nodes(Table *table,
+                                    const PsRequestMessage &request,
+                                    PsResponseMessage &response,
+                                    brpc::Controller *cntl);
+  int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request,
+                              PsResponseMessage &response,
+                              brpc::Controller *cntl);
+  int32_t barrier(Table *table, const PsRequestMessage &request,
+                  PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t load_one_table(Table *table, const PsRequestMessage &request,
+                         PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t load_all_table(Table *table, const PsRequestMessage &request,
+                         PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t stop_server(Table *table, const PsRequestMessage &request,
+                      PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t start_profiler(Table *table, const PsRequestMessage &request,
+                         PsResponseMessage &response, brpc::Controller *cntl);
+  int32_t stop_profiler(Table *table, const PsRequestMessage &request,
+                        PsResponseMessage &response, brpc::Controller *cntl);
+
+  int32_t print_table_stat(Table *table, const PsRequestMessage &request,
+                           PsResponseMessage &response, brpc::Controller *cntl);
+
+ private:
+  bool _is_initialize_shard_info;
+  std::mutex _initialize_shard_mutex;
+  std::unordered_map<int32_t, serviceFunc> _msg_handler_map;
+  std::vector<float> _ori_values;
+  const int sample_nodes_ranges = 23;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc
new file mode 100644
index 00000000000000..61e4e0cf7bb915
--- /dev/null
+++ b/paddle/fluid/distributed/service/graph_py_service.cc
@@ -0,0 +1,325 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/service/graph_py_service.h"
+#include <sstream>
+#include <thread>  // NOLINT
+#include "butil/endpoint.h"
+#include "iomanip"
+#include "paddle/fluid/distributed/table/table.h"
+#include "paddle/fluid/framework/archive.h"
+#include "paddle/fluid/platform/profiler.h"
+namespace paddle {
+namespace distributed {
+std::vector<std::string> GraphPyService::split(std::string& str,
+                                               const char pattern) {
+  std::vector<std::string> res;
+  std::stringstream input(str);
+  std::string temp;
+  while (std::getline(input, temp, pattern)) {
+    res.push_back(temp);
+  }
+  return res;
+}
+
+void GraphPyService::add_table_feat_conf(std::string table_name,
+                                         std::string feat_name,
+                                         std::string feat_dtype,
+                                         int32_t feat_shape) {
+  if (this->table_id_map.count(table_name)) {
+    this->table_feat_conf_table_name.push_back(table_name);
+    this->table_feat_conf_feat_name.push_back(feat_name);
+    this->table_feat_conf_feat_dtype.push_back(feat_dtype);
+    this->table_feat_conf_feat_shape.push_back(feat_shape);
+  }
+}
+
+void GraphPyService::set_up(std::string ips_str, int shard_num,
+                            std::vector<std::string> node_types,
+                            std::vector<std::string> edge_types) {
+  set_shard_num(shard_num);
+  set_num_node_types(node_types.size());
+
+  // Assign consecutive table ids: node tables first, then edge tables.
+  for (size_t table_id = 0; table_id < node_types.size(); table_id++) {
+    uint32_t next_id = this->table_id_map.size();
+    this->table_id_map[node_types[table_id]] = next_id;
+  }
+  for (size_t table_id = 0; table_id < edge_types.size(); table_id++) {
+    uint32_t next_id = this->table_id_map.size();
+    this->table_id_map[edge_types[table_id]] = next_id;
+  }
+  server_size = 0;
+  std::vector<std::string> ips_list = split(ips_str, ';');
+  int index = 0;
+  for (auto ips : ips_list) {
+    auto ip_and_port = split(ips, ':');
+    server_list.push_back(ip_and_port[0]);
+    port_list.push_back(ip_and_port[1]);
+    uint32_t port = stoul(ip_and_port[1]);
+    auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index);
+    host_sign_list.push_back(ph_host.serialize_to_string());
+    index++;
+  }
+}
+void GraphPyClient::start_client() {
+  std::map<uint64_t, std::vector<paddle::distributed::Region>> dense_regions;
+  dense_regions.insert(
+      std::pair<uint64_t, std::vector<paddle::distributed::Region>>(0, {}));
+  auto regions = dense_regions[0];
+  ::paddle::distributed::PSParameter worker_proto = GetWorkerProto();
+  paddle::distributed::PaddlePSEnvironment _ps_env;
+  auto servers_ = host_sign_list.size();
+  _ps_env = paddle::distributed::PaddlePSEnvironment();
+  _ps_env.set_ps_servers(&host_sign_list, servers_);
+  worker_ptr = std::shared_ptr<paddle::distributed::GraphBrpcClient>(
+      (paddle::distributed::GraphBrpcClient*)
+          paddle::distributed::PSClientFactory::create(worker_proto));
+  worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id);
+  worker_ptr->set_shard_num(get_shard_num());
+}
+void GraphPyServer::start_server(bool block) {
+  std::string ip = server_list[rank];
+  uint32_t port = std::stoul(port_list[rank]);
+  ::paddle::distributed::PSParameter server_proto = this->GetServerProto();
+
+  auto _ps_env = paddle::distributed::PaddlePSEnvironment();
+  _ps_env.set_ps_servers(&this->host_sign_list,
+                         this->host_sign_list.size());
+  pserver_ptr = std::shared_ptr<paddle::distributed::GraphBrpcServer>(
+      (paddle::distributed::GraphBrpcServer*)
+          paddle::distributed::PSServerFactory::create(server_proto));
+  VLOG(0) << "pserver-ptr created ";
+  std::vector<framework::ProgramDesc> empty_vec;
+  framework::ProgramDesc empty_prog;
+  empty_vec.push_back(empty_prog);
+  pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec);
+  pserver_ptr->start(ip, port);
+  std::condition_variable* cv_ = pserver_ptr->export_cv();
+  if (block) {
+    std::mutex mutex_;
+    std::unique_lock<std::mutex> lock(mutex_);
+    cv_->wait(lock);
+  }
+}
+::paddle::distributed::PSParameter GraphPyServer::GetServerProto() {
+  // Generate server proto desc
+  ::paddle::distributed::PSParameter server_fleet_desc;
+  ::paddle::distributed::ServerParameter* server_proto =
+      server_fleet_desc.mutable_server_param();
+  ::paddle::distributed::DownpourServerParameter* downpour_server_proto =
+      server_proto->mutable_downpour_server_param();
+  ::paddle::distributed::ServerServiceParameter* server_service_proto =
+      downpour_server_proto->mutable_service_param();
+  server_service_proto->set_service_class("GraphBrpcService");
+  server_service_proto->set_server_class("GraphBrpcServer");
+  server_service_proto->set_client_class("GraphBrpcClient");
+  server_service_proto->set_start_server_port(0);
+  server_service_proto->set_server_thread_num(12);
+
+  for (auto& tuple : this->table_id_map) {
+    VLOG(0) << " make a new table " << tuple.second;
+    ::paddle::distributed::TableParameter* sparse_table_proto =
+        downpour_server_proto->add_downpour_table_param();
+    std::vector<std::string> feat_name;
+    std::vector<std::string> feat_dtype;
+    std::vector<int32_t> feat_shape;
+    for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) {
+      if (tuple.first == table_feat_conf_table_name[i]) {
+        feat_name.push_back(table_feat_conf_feat_name[i]);
+        feat_dtype.push_back(table_feat_conf_feat_dtype[i]);
+        feat_shape.push_back(table_feat_conf_feat_shape[i]);
+      }
+    }
+    std::string table_type;
+    if (tuple.second < this->num_node_types) {
+      table_type = "node";
+    } else {
+      table_type = "edge";
+    }
+
+    GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first,
+                                table_type, feat_name, feat_dtype, feat_shape);
+  }
+
+  return server_fleet_desc;
+}
+
+::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() {
+  ::paddle::distributed::PSParameter worker_fleet_desc;
+  ::paddle::distributed::WorkerParameter* worker_proto =
+      worker_fleet_desc.mutable_worker_param();
+
+  ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto =
+      worker_proto->mutable_downpour_worker_param();
+
+  for (auto& tuple : this->table_id_map) {
+    VLOG(0) << " make a new table " << tuple.second;
+    ::paddle::distributed::TableParameter* worker_sparse_table_proto =
+        downpour_worker_proto->add_downpour_table_param();
+    std::vector<std::string> feat_name;
+    std::vector<std::string> feat_dtype;
+    std::vector<int32_t> feat_shape;
+    for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) {
+      if (tuple.first == table_feat_conf_table_name[i]) {
+        feat_name.push_back(table_feat_conf_feat_name[i]);
+        feat_dtype.push_back(table_feat_conf_feat_dtype[i]);
+        feat_shape.push_back(table_feat_conf_feat_shape[i]);
+      }
+    }
+    std::string table_type;
+    if (tuple.second < this->num_node_types) {
+      table_type = "node";
+    } else {
+      table_type = "edge";
+    }
+
+    GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second,
+                                tuple.first, table_type, feat_name, feat_dtype,
+                                feat_shape);
+  }
+
+  ::paddle::distributed::ServerParameter* server_proto =
+      worker_fleet_desc.mutable_server_param();
+  ::paddle::distributed::DownpourServerParameter* downpour_server_proto =
+      server_proto->mutable_downpour_server_param();
+  ::paddle::distributed::ServerServiceParameter* server_service_proto =
+      downpour_server_proto->mutable_service_param();
+  server_service_proto->set_service_class("GraphBrpcService");
+  server_service_proto->set_server_class("GraphBrpcServer");
+  server_service_proto->set_client_class("GraphBrpcClient");
+  server_service_proto->set_start_server_port(0);
+  server_service_proto->set_server_thread_num(12);
+
+  for (auto& tuple : this->table_id_map) {
+    VLOG(0) << " make a new table " << tuple.second;
+    ::paddle::distributed::TableParameter* sparse_table_proto =
+        downpour_server_proto->add_downpour_table_param();
+    std::vector<std::string> feat_name;
+    std::vector<std::string> feat_dtype;
+    std::vector<int32_t> feat_shape;
+    for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) {
+      if (tuple.first == table_feat_conf_table_name[i]) {
+        feat_name.push_back(table_feat_conf_feat_name[i]);
+        feat_dtype.push_back(table_feat_conf_feat_dtype[i]);
+        feat_shape.push_back(table_feat_conf_feat_shape[i]);
+      }
+    }
+    std::string table_type;
+    if (tuple.second < this->num_node_types) {
+      table_type = "node";
+    } else {
+      table_type = "edge";
+    }
+
+    GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first,
+                                table_type, feat_name, feat_dtype, feat_shape);
+  }
+
+  return worker_fleet_desc;
+}
+void GraphPyClient::load_edge_file(std::string name, std::string filepath,
+                                   bool reverse) {
+  // 'e' means load edge
+  std::string params = "e";
+  if (reverse) {
+    // 'e<' means load edges from $2 to $1
+    params += "<";
+  } else {
+    // 'e>' means load edges from $1 to $2
+    params += ">";
+  }
+  if (this->table_id_map.count(name)) {
+    VLOG(0) << "loading data of type " << name << " from " << filepath;
+    uint32_t table_id = this->table_id_map[name];
+    auto status =
+        get_ps_client()->load(table_id, std::string(filepath), params);
+    status.wait();
+  }
+}
+
+void GraphPyClient::load_node_file(std::string name, std::string filepath) {
+  // 'n' means load nodes and 'node_type' follows
+  std::string params = "n" + name;
+  if (this->table_id_map.count(name)) {
+    uint32_t table_id = this->table_id_map[name];
+    auto status =
+        get_ps_client()->load(table_id, std::string(filepath), params);
+    status.wait();
+  }
+}
+std::vector<std::vector<std::pair<uint64_t, float>>>
+GraphPyClient::batch_sample_neighboors(std::string name,
+                                       std::vector<uint64_t> node_ids,
+                                       int sample_size) {
+  std::vector<std::vector<std::pair<uint64_t, float>>> v;
+  if (this->table_id_map.count(name)) {
+    uint32_t table_id = this->table_id_map[name];
+    auto status =
+        worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v);
+    status.wait();
+  }
+  return v;
+}
+
+std::vector<uint64_t> GraphPyClient::random_sample_nodes(std::string name,
+                                                         int server_index,
+                                                         int sample_size) {
+  std::vector<uint64_t> v;
+  if (this->table_id_map.count(name)) {
+    uint32_t table_id = this->table_id_map[name];
+    auto status =
+        worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v);
+    status.wait();
+  }
+  return v;
+}
+
+// (name, dtype, ndarray)
+std::vector<std::vector<std::string>> GraphPyClient::get_node_feat(
+    std::string node_type, std::vector<uint64_t> node_ids,
+    std::vector<std::string> feature_names) {
+  std::vector<std::vector<std::string>> v(
+      feature_names.size(), std::vector<std::string>(node_ids.size()));
+  if (this->table_id_map.count(node_type)) {
+    uint32_t table_id = this->table_id_map[node_type];
+    auto status =
+        worker_ptr->get_node_feat(table_id, node_ids, feature_names, v);
+    status.wait();
+  }
+  return v;
+}
+
+std::vector<FeatureNode> GraphPyClient::pull_graph_list(std::string name,
+                                                        int server_index,
+                                                        int start, int size,
+                                                        int step) {
+  std::vector<FeatureNode> res;
+  if (this->table_id_map.count(name)) {
+    uint32_t table_id = this->table_id_map[name];
+    auto status = worker_ptr->pull_graph_list(table_id, server_index, start,
+                                              size, step, res);
+    status.wait();
+  }
+  return res;
+}
+
+void GraphPyClient::stop_server() {
+  VLOG(0) << "going to stop server";
+  std::unique_lock<std::mutex> lock(mutex_);
+  if (stopped_) return;
+  auto status = this->worker_ptr->stop_server();
+  if (status.get() == 0) stopped_ = true;
+}
+void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); }
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h
new file mode 100644
index 00000000000000..e185f23e3d240f
--- /dev/null
+++ b/paddle/fluid/distributed/service/graph_py_service.h
@@ -0,0 +1,178 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <ThreadPool.h>
+#include <condition_variable>  // NOLINT
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <thread>  // NOLINT
+#include <unordered_map>
+#include <vector>
+#include "google/protobuf/text_format.h"
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/framework/variable.h"
+
+#include "paddle/fluid/distributed/ps.pb.h"
+#include "paddle/fluid/distributed/service/env.h"
+#include "paddle/fluid/distributed/service/graph_brpc_client.h"
+#include "paddle/fluid/distributed/service/graph_brpc_server.h"
+#include "paddle/fluid/distributed/service/sendrecv.pb.h"
+#include "paddle/fluid/distributed/service/service.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/string/printf.h"
+namespace paddle {
+namespace distributed {
+class GraphPyService {
+ protected:
+  std::vector<std::string> server_list, port_list, host_sign_list;
+  int server_size, shard_num;
+  int num_node_types;
+  std::unordered_map<std::string, uint32_t> table_id_map;
+  std::vector<std::string> table_feat_conf_table_name;
+  std::vector<std::string> table_feat_conf_feat_name;
+  std::vector<std::string> table_feat_conf_feat_dtype;
+  std::vector<int32_t> table_feat_conf_feat_shape;
+
+ public:
+  int get_shard_num() { return shard_num; }
+  void set_shard_num(int shard_num) { this->shard_num = shard_num; }
+  void GetDownpourSparseTableProto(
+      ::paddle::distributed::TableParameter* sparse_table_proto,
+      uint32_t table_id, std::string table_name, std::string table_type,
+      std::vector<std::string> feat_name, std::vector<std::string> feat_dtype,
+      std::vector<int32_t> feat_shape) {
+    sparse_table_proto->set_table_id(table_id);
+    sparse_table_proto->set_table_class("GraphTable");
+    sparse_table_proto->set_shard_num(shard_num);
+    sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE);
+    ::paddle::distributed::TableAccessorParameter* accessor_proto =
+        sparse_table_proto->mutable_accessor();
+
+    ::paddle::distributed::CommonAccessorParameter* common_proto =
+        sparse_table_proto->mutable_common();
+
+    // Set the GraphTable parameters: feature dtypes go into params, shapes
+    // into dims, and names into attributes.
+    common_proto->set_table_name(table_name);
+    common_proto->set_name(table_type);
+    for (size_t i = 0; i < feat_name.size(); i++) {
+      common_proto->add_params(feat_dtype[i]);
+      common_proto->add_dims(feat_shape[i]);
+      common_proto->add_attributes(feat_name[i]);
+    }
+
+    accessor_proto->set_accessor_class("CommMergeAccessor");
+  }
+
+  void set_server_size(int server_size) { this->server_size = server_size; }
+  void set_num_node_types(int num_node_types) {
+    this->num_node_types = num_node_types;
+  }
+  int get_server_size() { return server_size; }
+  std::vector<std::string> split(std::string& str, const char pattern);
+  void set_up(std::string ips_str, int shard_num,
+              std::vector<std::string> node_types,
+              std::vector<std::string> edge_types);
+
+  void add_table_feat_conf(std::string node_type, std::string feat_name,
+                           std::string feat_dtype, int32_t feat_shape);
+};
+class GraphPyServer : public GraphPyService {
+ public:
+  GraphPyServer() {}
+  void set_up(std::string ips_str, int shard_num,
+              std::vector<std::string> node_types,
+              std::vector<std::string> edge_types, int rank) {
+    set_rank(rank);
+    GraphPyService::set_up(ips_str, shard_num, node_types, edge_types);
+  }
+  int get_rank() { return rank; }
+  void set_rank(int rank) { this->rank = rank; }
+
+  void start_server(bool block = true);
+  ::paddle::distributed::PSParameter GetServerProto();
+  std::shared_ptr<paddle::distributed::GraphBrpcServer> get_ps_server() {
+    return pserver_ptr;
+  }
+
+ protected:
+  int rank;
+  std::shared_ptr<paddle::distributed::GraphBrpcServer> pserver_ptr;
+  std::thread* server_thread;
+};
+class GraphPyClient : public GraphPyService {
+ public:
+  void set_up(std::string ips_str, int shard_num,
+              std::vector<std::string> node_types,
+              std::vector<std::string> edge_types, int client_id) {
+    set_client_id(client_id);
+    GraphPyService::set_up(ips_str, shard_num, node_types, edge_types);
+  }
+  std::shared_ptr<paddle::distributed::GraphBrpcClient> get_ps_client() {
+    return worker_ptr;
+  }
+  void bind_local_server(int local_channel_index, GraphPyServer& server) {
+    worker_ptr->set_local_channel(local_channel_index);
+    worker_ptr->set_local_graph_service(
+        (paddle::distributed::GraphBrpcService*)server.get_ps_server()
+            ->get_service());
+  }
+  void stop_server();
+  void finalize_worker();
+  void load_edge_file(std::string name, std::string filepath, bool reverse);
+  void load_node_file(std::string name, std::string filepath);
+  int get_client_id() { return client_id; }
+  void set_client_id(int client_id) { this->client_id = client_id; }
+  void start_client();
+  std::vector<std::vector<std::pair<uint64_t, float>>> batch_sample_neighboors(
+      std::string name, std::vector<uint64_t> node_ids, int sample_size);
+  std::vector<uint64_t> random_sample_nodes(std::string name, int server_index,
+                                            int sample_size);
+  std::vector<std::vector<std::string>> get_node_feat(
+      std::string node_type, std::vector<uint64_t> node_ids,
+      std::vector<std::string> feature_names);
+  std::vector<FeatureNode> pull_graph_list(std::string name, int server_index,
+                                           int start, int size, int step = 1);
+  ::paddle::distributed::PSParameter GetWorkerProto();
+
+ protected:
+  mutable std::mutex mutex_;
+  int client_id;
+  std::shared_ptr<paddle::distributed::GraphBrpcClient> worker_ptr;
+  std::thread* client_thread;
+  bool stopped_ = false;
+};
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc
index 095b5dee0b28e4..3f78908baa3b1d 100644
--- a/paddle/fluid/distributed/service/ps_client.cc
+++ b/paddle/fluid/distributed/service/ps_client.cc
@@ -15,12 +15,13 @@
 #include "paddle/fluid/distributed/service/ps_client.h"
 #include "glog/logging.h"
 #include "paddle/fluid/distributed/service/brpc_ps_client.h"
+#include "paddle/fluid/distributed/service/graph_brpc_client.h"
 #include "paddle/fluid/distributed/table/table.h"
 
 namespace paddle {
 namespace distributed {
 REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient);
-
+REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient);
 int32_t PSClient::configure(
     const PSParameter &config,
     const std::map<uint64_t, std::vector<paddle::distributed::Region>> &regions,
@@ -78,9 +79,8 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) {
   }
 
   TableManager::instance().initialize();
-  LOG(INFO) << "Create PSClient[" << service_param.client_class()
-            << "] success";
+  VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success";
   return client;
 }
 }  // namespace distributed
-}  // namespace paddle
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h
index 50f5802c63a253..1c8abc6c2e8dcd 100644
--- a/paddle/fluid/distributed/service/ps_client.h
+++ b/paddle/fluid/distributed/service/ps_client.h
@@ -24,16 +24,11 @@
 #include "paddle/fluid/distributed/service/env.h"
 #include "paddle/fluid/distributed/service/sendrecv.pb.h"
 #include "paddle/fluid/distributed/table/accessor.h"
+#include "paddle/fluid/distributed/table/graph/graph_node.h"
 
 namespace paddle {
 namespace distributed {
-class PSEnvironment;
-class PsRequestMessage;
-class PsResponseMessage;
-class ValueAccessor;
-struct Region;
-
 using paddle::distributed::PsRequestMessage;
 using paddle::distributed::PsResponseMessage;
 
@@ -117,10 +112,11 @@ class PSClient {
   // The keys/values buffers must not be reused before the returned future
   // completes. Keys requested by multiple threads are merged, then scattered
  // to the servers; once the results return, the buffer is traversed and the
  // values are filled in.
+  // is_training distinguishes training from prediction requests; the server
+  // handles feature statistics and entry (admission) differently for each.
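+  // A minimal call sketch (illustrative only; assumes a configured client and
+  // caller-owned value buffers that stay alive until the future resolves):
+  //   std::vector<float *> bufs(num);  // each slot points at a value buffer
+  //   auto fut = client->pull_sparse(bufs.data(), table_id, keys, num,
+  //                                  /*is_training=*/true);
+  //   fut.wait();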
   virtual std::future<int32_t> pull_sparse(float **select_values, size_t table_id,
-                                           const uint64_t *keys,
-                                           size_t num) = 0;
+                                           const uint64_t *keys, size_t num,
+                                           bool is_training) = 0;
 
   virtual std::future<int32_t> print_table_stat(uint32_t table_id) = 0;
 
@@ -160,6 +156,7 @@ class PSClient {
     promise.set_value(-1);
     return fut;
   }
+  // client2client message handling: std::function<int32_t(msg_type, from_client_id, msg)>
   typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc;
diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto
index 6250f84c98754d..d908c26da9870a 100644
--- a/paddle/fluid/distributed/service/sendrecv.proto
+++ b/paddle/fluid/distributed/service/sendrecv.proto
@@ -48,6 +48,10 @@ enum PsCmdID {
   PS_START_PROFILER = 27;
   PS_STOP_PROFILER = 28;
   PS_PUSH_GLOBAL_STEP = 29;
+  PS_PULL_GRAPH_LIST = 30;
+  PS_GRAPH_SAMPLE_NEIGHBOORS = 31;
+  PS_GRAPH_SAMPLE_NODES = 32;
+  PS_GRAPH_GET_NODE_FEAT = 33;
 }
 
 message PsRequestMessage {
@@ -111,4 +115,4 @@ message MultiVariableMessage {
 service PsService {
   rpc service(PsRequestMessage) returns (PsResponseMessage);
   rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage);
-};
\ No newline at end of file
+};
diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc
index fc230a0b9c92e6..9324adad6979ed 100644
--- a/paddle/fluid/distributed/service/server.cc
+++ b/paddle/fluid/distributed/service/server.cc
@@ -16,6 +16,7 @@
 
 #include "glog/logging.h"
 #include "paddle/fluid/distributed/service/brpc_ps_server.h"
+#include "paddle/fluid/distributed/service/graph_brpc_server.h"
 #include "paddle/fluid/distributed/table/table.h"
 
 namespace paddle {
@@ -23,6 +24,8 @@ namespace distributed {
 
 REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer);
 REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService);
+REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer);
+REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService);
 
 PSServer *PSServerFactory::create(const PSParameter &ps_config) {
   const auto &config = ps_config.server_param();
diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc
index 3d0f94fac27750..2759e4614e66e1 100644
--- a/paddle/fluid/distributed/service/service.cc
+++ b/paddle/fluid/distributed/service/service.cc
@@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt(
 }
 
 void PSCore::init_gflag(const std::string& gflags) {
-  LOG(INFO) << "Init With Gflags:" << gflags;
+  VLOG(3) << "Init With Gflags:" << gflags;
   std::vector<std::string> flags = paddle::string::split_string(gflags);
   if (flags.size() < 1) {
     flags.push_back("-max_body_size=314217728");
diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt
index 1e98e193d54ae6..dde1f5ae8ee3a1 100644
--- a/paddle/fluid/distributed/table/CMakeLists.txt
+++ b/paddle/fluid/distributed/table/CMakeLists.txt
@@ -1,13 +1,19 @@
 set_property(GLOBAL PROPERTY TABLE_DEPS string_helper)
-
+set(graphDir graph)
 get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS)
-
+set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc)
+set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge)
+set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler)
 set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 
-cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator)
+cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator)
 
 set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc
new file mode 100644
index 00000000000000..020bcdcc52ef4b
--- /dev/null
+++ b/paddle/fluid/distributed/table/common_graph_table.cc
@@ -0,0 +1,506 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/table/common_graph_table.h"
+#include <time.h>
+#include <algorithm>
+#include <set>
+#include <sstream>
+#include "paddle/fluid/distributed/common/utils.h"
+#include "paddle/fluid/distributed/table/graph/graph_node.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/string/string_helper.h"
+namespace paddle {
+namespace distributed {
+
+std::vector<Node *> GraphShard::get_batch(int start, int end, int step) {
+  if (start < 0) start = 0;
+  std::vector<Node *> res;
+  for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) {
+    res.push_back(bucket[pos]);
+  }
+  return res;
+}
+
+size_t GraphShard::get_size() { return bucket.size(); }
+
+GraphNode *GraphShard::add_graph_node(uint64_t id) {
+  if (node_location.find(id) == node_location.end()) {
+    node_location[id] = bucket.size();
+    bucket.push_back(new GraphNode(id));
+  }
+  return (GraphNode *)bucket[node_location[id]];
+}
+
+FeatureNode *GraphShard::add_feature_node(uint64_t id) {
+  if (node_location.find(id) == node_location.end()) {
+    node_location[id] = bucket.size();
+    bucket.push_back(new FeatureNode(id));
+  }
+  return (FeatureNode *)bucket[node_location[id]];
+}
+
+void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) {
+  find_node(id)->add_edge(dst_id, weight);
+}
+
+Node *GraphShard::find_node(uint64_t id) {
+  auto iter = node_location.find(id);
+  return iter == node_location.end() ? nullptr : bucket[iter->second];
+}
+
+int32_t GraphTable::load(const std::string &path, const std::string &param) {
+  bool load_edge = (param[0] == 'e');
+  bool load_node = (param[0] == 'n');
+  if (load_edge) {
+    bool reverse_edge = (param[1] == '<');
+    return this->load_edges(path, reverse_edge);
+  }
+  if (load_node) {
+    std::string node_type = param.substr(1);
+    return this->load_nodes(path, node_type);
+  }
+  return 0;
+}
+
+// Collect the ids falling into the given index ranges over the concatenation
+// of all local shards; the result vector is shuffled as ids arrive.
+int32_t GraphTable::get_nodes_ids_by_ranges(
+    std::vector<std::pair<int, int>> ranges, std::vector<uint64_t> &res) {
+  int start = 0, end, index = 0, total_size = 0;
+  res.clear();
+  std::vector<std::future<std::vector<uint64_t>>> tasks;
+  for (size_t i = 0; i < shards.size() && index < (int)ranges.size(); i++) {
+    end = total_size + shards[i].get_size();
+    start = total_size;
+    while (start < end && index < (int)ranges.size()) {
+      if (ranges[index].second <= start)
+        index++;
+      else if (ranges[index].first >= end) {
+        break;
+      } else {
+        int first = std::max(ranges[index].first, start);
+        int second = std::min(ranges[index].second, end);
+        start = second;
+        first -= total_size;
+        second -= total_size;
+        tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
+            [this, first, second, i]() -> std::vector<uint64_t> {
+              return shards[i].get_ids_by_range(first, second);
+            }));
+      }
+    }
+    total_size += shards[i].get_size();
+  }
+  for (size_t i = 0; i < tasks.size(); i++) {
+    auto vec = tasks[i].get();
+    for (auto &id : vec) {
+      res.push_back(id);
+      std::swap(res[rand() % res.size()], res[(int)res.size() - 1]);
+    }
+  }
+  return 0;
+}
+
+int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) {
+  auto paths = paddle::string::split_string<std::string>(path, ";");
+  int64_t count = 0;
+  int64_t valid_count = 0;
+  for (auto path : paths) {
+    std::ifstream file(path);
+    std::string line;
+    while (std::getline(file, line)) {
+      count++;
+      auto values = paddle::string::split_string<std::string>(line, "\t");
+      if (values.size() < 2) continue;
+      auto id = std::stoull(values[1]);
+
+      size_t shard_id = id % shard_num;
+      if (shard_id >= shard_end || shard_id < shard_start) {
+        VLOG(4) << "will not load " << id << " from " << path
+                << ", please check id distribution";
+        continue;
+      }
+
+      if (count % 1000000 == 0) {
+        VLOG(0) << count << " nodes are loaded from filepath";
+      }
+
+      std::string nt = values[0];
+      if (nt != node_type) {
+        continue;
+      }
+
+      size_t index = shard_id - shard_start;
+
+      auto node = shards[index].add_feature_node(id);
+
+      node->set_feature_size(feat_name.size());
+
+      for (size_t slice = 2; slice < values.size(); slice++) {
+        auto feat = this->parse_feature(values[slice]);
+        if (feat.first >= 0) {
+          node->set_feature(feat.first, feat.second);
+        } else {
+          VLOG(4) << "Node feature: " << values[slice]
+                  << " not in feature_map.";
+        }
+      }
+      valid_count++;
+    }
+  }
+
+  VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type
+          << " are loaded successfully in " << path;
+  return 0;
+}
+
+int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) {
+  auto paths = paddle::string::split_string<std::string>(path, ";");
+  int count = 0;
+  std::string sample_type = "random";
+  bool is_weighted = false;
+  int valid_count = 0;
+
+  for (auto path : paths) {
+    std::ifstream file(path);
+    std::string line;
+    while (std::getline(file, line)) {
+      auto values = paddle::string::split_string<std::string>(line, "\t");
+      count++;
+      if (values.size() < 2) continue;
+      auto src_id = std::stoull(values[0]);
+      auto dst_id = std::stoull(values[1]);
+      if (reverse_edge) {
+        std::swap(src_id, dst_id);
+      }
+      float weight = 1;
+      if (values.size() == 3) {
+        weight = std::stof(values[2]);
+        sample_type = "weighted";
+        is_weighted = true;
+      }
+
+      size_t src_shard_id = src_id % shard_num;
+      if (src_shard_id >= shard_end || src_shard_id < shard_start) {
+        VLOG(4) << "will not load " << src_id << " from " << path
+                << ", please check id distribution";
+        continue;
+      }
+      if (count % 1000000 == 0) {
+        VLOG(0) << count << " edges are loaded from filepath";
+      }
+
+      size_t index = src_shard_id - shard_start;
+      shards[index].add_graph_node(src_id)->build_edges(is_weighted);
+      shards[index].add_neighboor(src_id, dst_id, weight);
+      valid_count++;
+    }
+  }
+  VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in "
+          << path;
+
+  // Build the sampler of every node once all of its edges are in place.
+  for (auto &shard : shards) {
+    auto bucket = shard.get_bucket();
+    for (size_t i = 0; i < bucket.size(); i++) {
+      bucket[i]->build_sampler(sample_type);
+    }
+  }
+  return 0;
+}
+
+Node *GraphTable::find_node(uint64_t id) {
+  size_t shard_id = id % shard_num;
+  if (shard_id >= shard_end || shard_id < shard_start) {
+    return nullptr;
+  }
+  size_t index = shard_id - shard_start;
+  Node *node = shards[index].find_node(id);
+  return node;
+}
+uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) {
+  return node_id % shard_num % shard_num_per_table % task_pool_size_;
+}
+// Draw `sample_size` node ids by cutting the global node sequence into
+// `range_num` random contiguous ranges and collecting the ids they cover.
+int32_t GraphTable::random_sample_nodes(int sample_size,
+                                        std::unique_ptr<char[]> &buffer,
+                                        int &actual_size) {
+  bool need_feature = false;
+  int total_size = 0;
+  for (size_t i = 0; i < shards.size(); i++) {
+    total_size += shards[i].get_size();
+  }
+  if (sample_size > total_size) sample_size = total_size;
+  int range_num = random_sample_nodes_ranges;
+  if (range_num > sample_size) range_num = sample_size;
+  if (sample_size == 0 || range_num == 0) return 0;
+  std::vector<int> ranges_len, ranges_pos;
+  int remain = sample_size, last_pos = -1, num;
+  std::set<int> separator_set;
+  for (int i = 0; i < range_num - 1; i++) {
+    while (separator_set.find(num = rand() % (sample_size - 1)) !=
+           separator_set.end())
+      ;
+    separator_set.insert(num);
+  }
+  for (auto p : separator_set) {
+    ranges_len.push_back(p - last_pos);
+    last_pos = p;
+  }
+  ranges_len.push_back(sample_size - 1 - last_pos);
+  remain = total_size - sample_size + range_num;
+  separator_set.clear();
+  for (int i = 0; i < range_num; i++) {
+    while (separator_set.find(num = rand() % remain) != separator_set.end())
+      ;
+    separator_set.insert(num);
+  }
+  int used = 0, index = 0;
+  last_pos = -1;
+  for (auto p : separator_set) {
+    used += p - last_pos - 1;
+    last_pos = p;
+    ranges_pos.push_back(used);
+    used += ranges_len[index++];
+  }
+  std::vector<std::pair<int, int>> first_half, second_half;
+  int start_index = rand() % total_size;
+  for (size_t i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) {
+    if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size)
+      first_half.push_back({ranges_pos[i] + start_index,
+                            ranges_pos[i] + ranges_len[i] + start_index});
+    else if (ranges_pos[i] + start_index >= total_size) {
+      second_half.push_back(
+          {ranges_pos[i] + start_index - total_size,
+           ranges_pos[i] + ranges_len[i] + start_index - total_size});
+    } else {
+      first_half.push_back({ranges_pos[i] + start_index, total_size});
+      second_half.push_back(
+          {0, ranges_pos[i] + ranges_len[i] + start_index - total_size});
+    }
+  }
+  for (auto &pair : first_half) second_half.push_back(pair);
+  std::vector<uint64_t> res;
+  get_nodes_ids_by_ranges(second_half, res);
+  actual_size = res.size() * sizeof(uint64_t);
+  buffer.reset(new char[actual_size]);
+  char *pointer = buffer.get();
+  memcpy(pointer, res.data(), actual_size);
+  return 0;
+}
+int32_t GraphTable::random_sample_neighboors(
+    uint64_t *node_ids, int sample_size,
+    std::vector<std::unique_ptr<char[]>> &buffers,
+    std::vector<int> &actual_sizes) {
+  size_t node_num = buffers.size();
+  std::vector<std::future<int>> tasks;
+  for (size_t idx = 0; idx < node_num; ++idx) {
+    uint64_t node_id = node_ids[idx];
+    tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
+        // idx and node_id are captured by value: the task may run after this
+        // loop iteration's locals have gone out of scope.
+        [this, node_id, sample_size, idx, &buffers, &actual_sizes]() -> int {
+          std::unique_ptr<char[]> &buffer = buffers[idx];
+          int &actual_size = actual_sizes[idx];
+          Node *node = find_node(node_id);
+
+          if (node == nullptr) {
+            actual_size = 0;
+            return 0;
+          }
+          std::vector<int> res = node->sample_k(sample_size);
+          actual_size = res.size() * (Node::id_size + Node::weight_size);
+          int offset = 0;
+          uint64_t id;
+          float weight;
+          char *buffer_addr = new char[actual_size];
+          buffer.reset(buffer_addr);
+          for (int &x : res) {
+            id = node->get_neighbor_id(x);
+            weight = node->get_neighbor_weight(x);
+            memcpy(buffer_addr + offset, &id, Node::id_size);
+            offset += Node::id_size;
+            memcpy(buffer_addr + offset, &weight, Node::weight_size);
+            offset += Node::weight_size;
+          }
+          return 0;
+        }));
+  }
+  for (size_t idx = 0; idx < node_num; ++idx) {
+    tasks[idx].get();
+  }
+  return 0;
+}
+
+int32_t GraphTable::get_node_feat(const std::vector<uint64_t> &node_ids,
+                                  const std::vector<std::string> &feature_names,
+                                  std::vector<std::vector<std::string>> &res) {
+  size_t node_num = node_ids.size();
+  std::vector<std::future<int>> tasks;
+  for (size_t idx = 0; idx < node_num; ++idx) {
+    uint64_t node_id = node_ids[idx];
+    tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue(
+        [&, idx, node_id]() -> int {
+          Node *node = find_node(node_id);
+
+          if (node == nullptr) {
+            return 0;
+          }
+          for (size_t feat_idx = 0; feat_idx < feature_names.size();
+               ++feat_idx) {
+            const std::string &feature_name = feature_names[feat_idx];
+            if (feat_id_map.find(feature_name) != feat_id_map.end()) {
+              res[feat_idx][idx] = node->get_feature(feat_id_map[feature_name]);
+            }
+          }
+          return 0;
+        }));
+  }
+  for (size_t idx = 0; idx < node_num; ++idx) {
+    tasks[idx].get();
+  }
+  return 0;
+}
+
+std::pair<int32_t, std::string> GraphTable::parse_feature(
+    std::string feat_str) {
+  // Return (feat_id, bytes) if the name is in this->feat_name, otherwise
+  // return (-1, "").
+  auto fields = paddle::string::split_string<std::string>(feat_str, " ");
+  if (this->feat_id_map.count(fields[0])) {
+    int32_t id = this->feat_id_map[fields[0]];
+    std::string dtype = this->feat_dtype[id];
+    std::vector<std::string> values(fields.begin() + 1, fields.end());
+    if (dtype == "feasign") {
+      return std::make_pair(int32_t(id),
+                            paddle::string::join_strings(values, ' '));
+    } else if (dtype == "string") {
+      return std::make_pair(int32_t(id),
+                            paddle::string::join_strings(values, ' '));
+    } else if (dtype == "float32") {
+      return std::make_pair(int32_t(id),
+                            FeatureNode::parse_value_to_bytes<float>(values));
+    } else if (dtype == "float64") {
+      return std::make_pair(int32_t(id),
+                            FeatureNode::parse_value_to_bytes<double>(values));
+    } else if (dtype == "int32") {
+      return std::make_pair(int32_t(id),
+                            FeatureNode::parse_value_to_bytes<int32_t>(values));
+    } else if (dtype == "int64") {
+      return std::make_pair(int32_t(id),
+                            FeatureNode::parse_value_to_bytes<int64_t>(values));
+    }
+  }
+  return std::make_pair(-1, "");
+}
+
+int32_t GraphTable::pull_graph_list(int start, int total_size,
+                                    std::unique_ptr<char[]> &buffer,
+                                    int &actual_size, bool need_feature,
+                                    int step) {
+  if (start < 0) start = 0;
+  int size = 0, cur_size;
+  std::vector<std::future<std::vector<Node *>>> tasks;
+  for (size_t i = 0; i < shards.size() && total_size > 0; i++) {
+    cur_size = shards[i].get_size();
+    if (size + cur_size <= start) {
+      size += cur_size;
+      continue;
+    }
+    int count = std::min(1 + (size + cur_size - start - 1) / step, total_size);
+    int end = start + (count - 1) * step + 1;
+    tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue(
+        [this, i, start, end, step, size]() -> std::vector<Node *> {
+          return this->shards[i].get_batch(start - size, end - size, step);
+        }));
+    start += count * step;
+    total_size -= count;
+    size += cur_size;
+  }
+  for (size_t i = 0; i < tasks.size(); ++i) {
+    tasks[i].wait();
+  }
+  size = 0;
+  std::vector<std::vector<Node *>> res;
+  for (size_t i = 0; i < tasks.size(); i++) {
+    res.push_back(tasks[i].get());
+    for (size_t j = 0; j < res.back().size(); j++) {
+      size += res.back()[j]->get_size(need_feature);
+    }
+  }
+  char *buffer_addr = new char[size];
+  buffer.reset(buffer_addr);
+  int index = 0;
+  for (size_t i = 0; i < res.size(); i++) {
+    for (size_t j = 0; j < res[i].size(); j++) {
+      res[i][j]->to_buffer(buffer_addr + index, need_feature);
+      index += res[i][j]->get_size(need_feature);
+    }
+  }
+  actual_size = size;
+  return 0;
+}
+int32_t GraphTable::initialize() {
+  _shards_task_pool.resize(task_pool_size_);
+  for (size_t i = 0; i < _shards_task_pool.size(); ++i) {
+    _shards_task_pool[i].reset(new ::ThreadPool(1));
+  }
+  server_num = _shard_num;
+  /*
+  _shard_num is actually the server number here:
+  when a server initializes its tables, it sets each table's _shard_num to
+  server_num, and _shard_idx to the server's rank.
+  */
+  auto common = _config.common();
+
+  this->table_name = common.table_name();
+  this->table_type = common.name();
+  VLOG(0) << " init graph table type " << this->table_type << " table name "
+          << this->table_name;
+  int feat_conf_size = static_cast<int>(common.attributes().size());
+  for (int i = 0; i < feat_conf_size; i++) {
+    auto &f_name = common.attributes()[i];
+    auto &f_shape = common.dims()[i];
+    auto &f_dtype = common.params()[i];
+    this->feat_name.push_back(f_name);
+    this->feat_shape.push_back(f_shape);
+    this->feat_dtype.push_back(f_dtype);
+    this->feat_id_map[f_name] = i;
+    VLOG(0) << "init graph table feat conf name:" << f_name
+            << " shape:" << f_shape << " dtype:" << f_dtype;
+  }
+
+  shard_num = _config.shard_num();
+  VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx"
+          << _shard_idx;
+  shard_num_per_table = sparse_local_shard_num(shard_num, server_num);
+  shard_start = _shard_idx * shard_num_per_table;
+  shard_end = shard_start + shard_num_per_table;
+  VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start "
+          << shard_start << " shard_end " << shard_end;
+  shards = std::vector<GraphShard>(shard_num_per_table, GraphShard(shard_num));
+  return 0;
+}
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h
new file mode 100644
index 00000000000000..8ddf3c8f904a6c
--- /dev/null
+++ b/paddle/fluid/distributed/table/common_graph_table.h
@@ -0,0 +1,147 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { +class GraphShard { + public: + // static int bucket_low_bound; + // static int gcd(int s, int t) { + // if (s % t == 0) return t; + // return gcd(t, s % t); + // } + size_t get_size(); + GraphShard() {} + GraphShard(int shard_num) { + this->shard_num = shard_num; + // bucket_size = init_bucket_size(shard_num); + // bucket.resize(bucket_size); + } + std::vector &get_bucket() { return bucket; } + std::vector get_batch(int start, int end, int step); + // int init_bucket_size(int shard_num) { + // for (int i = bucket_low_bound;; i++) { + // if (gcd(i, shard_num) == 1) return i; + // } + // return -1; + // } + std::vector get_ids_by_range(int start, int end) { + std::vector res; + for (int i = start; i < end && i < bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } + GraphNode *add_graph_node(uint64_t id); + FeatureNode *add_feature_node(uint64_t id); + Node *find_node(uint64_t id); + void add_neighboor(uint64_t id, uint64_t dst_id, float weight); + // std::unordered_map::iterator> + std::unordered_map get_node_location() { + return node_location; + } + + private: + std::unordered_map node_location; + int shard_num; + std::vector bucket; +}; +class GraphTable : public SparseTable { + public: + GraphTable() {} + virtual ~GraphTable() {} + virtual int32_t pull_graph_list(int start, int size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step); + + virtual int32_t random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes); + + int32_t random_sample_nodes(int sample_size, std::unique_ptr &buffers, + int &actual_sizes); + + virtual int32_t get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res); + virtual int32_t initialize(); + + int32_t load(const std::string &path, const std::string ¶m); + + int32_t load_edges(const std::string &path, bool reverse); + + int32_t load_nodes(const std::string &path, std::string node_type); + + Node *find_node(uint64_t id); + + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) { + return 0; + } + + virtual int32_t push_sparse(const uint64_t *keys, const float *values, + size_t num) { + return 0; + } + + virtual void clear() {} + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string ¶m) { return 0; } + //指定保存路径 + virtual int32_t save(const std::string &path, const std::string &converter) { + return 0; + } + virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual std::pair parse_feature(std::string feat_str); + + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res); + + protected: + std::vector shards; + size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + const int task_pool_size_ = 11; + const int random_sample_nodes_ranges = 3; + + std::vector feat_name; + std::vector feat_dtype; + std::vector 
feat_shape; + std::unordered_map feat_id_map; + std::string table_name; + std::string table_type; + + std::vector> _shards_task_pool; +}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index ffedbea14a0290..a25a90aa9a7c1f 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -254,7 +254,6 @@ int32_t CommonSparseTable::initialize_value() { } auto accessor = _config.accessor(); - std::vector feasigns; for (size_t x = 0; x < accessor.fea_dim(); ++x) { @@ -271,9 +270,14 @@ int32_t CommonSparseTable::initialize_value() { std::vector ids(bucket_feasigns); std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); std::vector pulls; pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), ids.data(), bucket_feasigns); + pull_sparse(pulls.data(), pull_value); } return 0; @@ -399,32 +403,36 @@ int32_t CommonSparseTable::pour() { return 0; } -int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, - size_t num) { +int32_t CommonSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { rwlock_->RDLock(); - std::vector> offset_bucket; - offset_bucket.resize(task_pool_size_); - - for (int x = 0; x < num; ++x) { - auto y = keys[x] % task_pool_size_; - offset_bucket[y].push_back(x); - } - - std::vector> tasks(task_pool_size_); + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); - for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( - [this, shard_id, &keys, &offset_bucket, &pull_values]() -> int { + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { auto& block = shard_values_[shard_id]; - auto& offsets = offset_bucket[shard_id]; - for (int i = 0; i < offsets.size(); ++i) { - auto offset = offsets[i]; - auto id = keys[offset]; - auto* value = block->Init(id); - std::copy_n(value + param_offset_, param_dim_, - pull_values + param_dim_ * offset); + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + if (pull_value.is_training_) { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + auto* value = block->Init(feasign, true, frequencie); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } else { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto* value = block->Init(feasign, false); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } } return 0; diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 98cbf2b4a21057..31f4dabcdfdd74 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -61,8 +61,7 @@ class CommonSparseTable : public SparseTable { int32_t save(const std::string& path, const std::string& param); virtual std::pair print_table_stat(); - virtual int32_t pull_sparse(float* pull_values, const uint64_t* keys, - size_t num); + virtual int32_t pull_sparse(float* values, const 
PullSparseValue& pull_value); virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/table/common_table.h index dc3cfa75ff6898..bc7f17f5f24579 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/table/common_table.h @@ -98,8 +98,8 @@ class DenseTable : public Table { virtual ~DenseTable() {} virtual void *get_shard(size_t shard_idx) { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -123,8 +123,8 @@ class BarrierTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index a2acdfd20148ac..8079003d1bf8f6 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer { auto blas = GetBlas(); float lr = *(global_learning_rate_) * (*learning_rate); - VLOG(4) << "DSGD LearningRate: " << lr; blas.VCOPY(update_numel, update_values + begin, grads.data()); blas.SCAL(update_numel, lr, grads.data()); blas.VSUB(update_numel, param + begin, grads.data(), param + begin); @@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer { beta2_pow[0] = beta2_pow[0] * beta2; float lr_ = *(global_learning_rate_)*learning_rate[0]; - VLOG(4) << "DAdam LearningRate: " << lr_; lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); float* tmp_ = tmp.data(); diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ba79a381a6d881..cb077033cad42d 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -155,7 +155,8 @@ class ValueBlock { } // pull - float *Init(const uint64_t &id, const bool with_update = true) { + float *Init(const uint64_t &id, const bool with_update = true, + const int counter = 1) { if (!Has(id)) { values_[id] = std::make_shared(value_length_); } @@ -163,16 +164,16 @@ class ValueBlock { auto &value = values_.at(id); if (with_update) { - AttrUpdate(value); + AttrUpdate(value, counter); } return value->data_.data(); } - void AttrUpdate(std::shared_ptr value) { + void AttrUpdate(std::shared_ptr value, const int counter) { // update state value->unseen_days_ = 0; - ++value->count_; + value->count_ += counter; if (!value->is_entry_) { value->is_entry_ = entry_func_(value); diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h index 672d6e7d396874..0e1d7ef03c129c 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/table/depends/sparse.h @@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer { auto* value = block->Get(id); float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0]; - VLOG(4) << "SSGD LearningRate: " << learning_rate; float* param = value + param_offset; std::vector grads; @@ -166,7 +165,6 @@ 
       if (!block->GetEntry(id)) continue;
       auto* values = block->Get(id);
       float lr_ = *(global_learning_rate_) * (values + lr_offset)[0];
-      VLOG(4) << "SAdam LearningRate: " << lr_;
       float* param = values + param_offset;
       float* moment1 = values + m1_offset;
       float* moment2 = values + m2_offset;
diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h
new file mode 100644
index 00000000000000..c185dd17d792e4
--- /dev/null
+++ b/paddle/fluid/distributed/table/depends/sparse_utils.h
@@ -0,0 +1,74 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace distributed {
+
+struct PullSparseValue {
+  explicit PullSparseValue(int numel, int dim)
+      : numel_(numel),
+        dim_(dim),
+        is_training_(true),
+        feasigns_(nullptr),
+        frequencies_(nullptr) {}
+
+  explicit PullSparseValue(std::vector<uint64_t>& feasigns,
+                           std::vector<uint32_t>& frequencies, int dim) {
+    numel_ = feasigns.size();
+    dim_ = dim;
+    is_training_ = true;
+    feasigns_ = feasigns.data();
+    frequencies_ = frequencies.data();
+  }
+
+  void DeserializeFromBytes(void* bytes) {
+    /*
+    |---isTraining--------------|
+    |---8*{num}B(keysData)------|
+    |---4*{num}B(Frequencies)---|
+    */
+    auto* begin = reinterpret_cast<char*>(bytes);
+    is_training_ = reinterpret_cast<bool*>(begin)[0];
+    feasigns_ = reinterpret_cast<uint64_t*>(begin + sizeof(bool));
+    frequencies_ = reinterpret_cast<uint32_t*>(begin + sizeof(bool) +
+                                               sizeof(uint64_t) * numel_);
+  }
+
+  void Fission(const int shard_id, const int shard_num,
+               std::vector<int>* offset_shard) const {
+    offset_shard->reserve(numel_ / shard_num + 1);
+    for (int x = 0; x < numel_; ++x) {
+      if (feasigns_[x] % shard_num == shard_id) {
+        offset_shard->push_back(x);
+      }
+    }
+  }
+
+  int numel_;
+  int dim_;
+  bool is_training_;
+  uint64_t* feasigns_;
+  uint32_t* frequencies_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
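The PullSparseValue request above bundles the pulled keys (feasigns), their occurrence frequencies, and a training flag into one object; Fission() then picks out the offsets of the keys that route to a given shard via feasigns_[x] % shard_num == shard_id. A minimal standalone sketch of that sharding step, not part of the patch (the key values, shard count, and dim below are made up for illustration):

// Illustrative sketch only: route pulled keys to shards via Fission().
#include <cstdint>
#include <iostream>
#include <vector>
#include "paddle/fluid/distributed/table/depends/sparse_utils.h"

int main() {
  std::vector<uint64_t> keys = {0, 7, 14, 3, 9};  // hypothetical feasigns
  std::vector<uint32_t> freqs(keys.size(), 1);    // each key pulled once
  paddle::distributed::PullSparseValue pull_value(keys, freqs, /*dim=*/8);

  const int shard_num = 7;  // hypothetical shard count
  for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
    std::vector<int> offsets;
    pull_value.Fission(shard_id, shard_num, &offsets);
    for (int off : offsets) {
      std::cout << "shard " << shard_id << " pulls key "
                << pull_value.feasigns_[off] << "\n";
    }
  }
  return 0;
}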
diff --git a/paddle/fluid/distributed/table/graph/graph_edge.cc b/paddle/fluid/distributed/table/graph/graph_edge.cc
new file mode 100644
index 00000000000000..0ab0d5a76d6715
--- /dev/null
+++ b/paddle/fluid/distributed/table/graph/graph_edge.cc
@@ -0,0 +1,29 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/table/graph/graph_edge.h"
+#include <cstring>
+namespace paddle {
+namespace distributed {
+
+void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) {
+  id_arr.push_back(id);
+}
+
+void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) {
+  id_arr.push_back(id);
+  weight_arr.push_back(weight);
+}
+}
+}
diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/table/graph/graph_edge.h
new file mode 100644
index 00000000000000..3dfe5a6f357a7c
--- /dev/null
+++ b/paddle/fluid/distributed/table/graph/graph_edge.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cstdint>
+#include <cstring>
+#include <vector>
+namespace paddle {
+namespace distributed {
+
+class GraphEdgeBlob {
+ public:
+  GraphEdgeBlob() {}
+  virtual ~GraphEdgeBlob() {}
+  size_t size() { return id_arr.size(); }
+  virtual void add_edge(uint64_t id, float weight);
+  uint64_t get_id(int idx) { return id_arr[idx]; }
+  virtual float get_weight(int idx) { return 1; }
+
+ protected:
+  std::vector<uint64_t> id_arr;
+};
+
+class WeightedGraphEdgeBlob : public GraphEdgeBlob {
+ public:
+  WeightedGraphEdgeBlob() {}
+  virtual ~WeightedGraphEdgeBlob() {}
+  virtual void add_edge(uint64_t id, float weight);
+  virtual float get_weight(int idx) { return weight_arr[idx]; }
+
+ protected:
+  std::vector<float> weight_arr;
+};
+}
+}
diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc
new file mode 100644
index 00000000000000..816d31b979072c
--- /dev/null
+++ b/paddle/fluid/distributed/table/graph/graph_node.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h new file mode 100644 index 00000000000000..8ad795ac97b549 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <cstring>
+#include <memory>
+#include <sstream>
+#include <vector>
+#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h"
+namespace paddle {
+namespace distributed {
+
+class Node {
+ public:
+  Node() {}
+  Node(uint64_t id) : id(id) {}
+  virtual ~Node() {}
+  static int id_size, int_size, weight_size;
+  uint64_t get_id() { return id; }
+  void set_id(uint64_t id) { this->id = id; }
+
+  virtual void build_edges(bool is_weighted) {}
+  virtual void build_sampler(std::string sample_type) {}
+  virtual void add_edge(uint64_t id, float weight) {}
+  virtual std::vector<int> sample_k(int k) { return std::vector<int>(); }
+  virtual uint64_t get_neighbor_id(int idx) { return 0; }
+  virtual float get_neighbor_weight(int idx) { return 1.; }
+
+  virtual int get_size(bool need_feature);
+  virtual void to_buffer(char *buffer, bool need_feature);
+  virtual void recover_from_buffer(char *buffer);
+  virtual std::string get_feature(int idx) { return std::string(""); }
+  virtual void set_feature(int idx, std::string str) {}
+  virtual void set_feature_size(int size) {}
+  virtual int get_feature_size() { return 0; }
+
+ protected:
+  uint64_t id;
+};
+
+class GraphNode : public Node {
+ public:
+  GraphNode() : Node(), sampler(nullptr), edges(nullptr) {}
+  GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {}
+  virtual ~GraphNode();
+  virtual void build_edges(bool is_weighted);
+  virtual void build_sampler(std::string sample_type);
+  virtual void add_edge(uint64_t id, float weight) {
+    edges->add_edge(id, weight);
+  }
+  virtual std::vector<int> sample_k(int k) { return sampler->sample_k(k); }
+  virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); }
+  virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); }
+
+ protected:
+  Sampler *sampler;
+  GraphEdgeBlob *edges;
+};
+
+class FeatureNode : public Node {
+ public:
+  FeatureNode() : Node() {}
+  FeatureNode(uint64_t id) : Node(id) {}
+  virtual ~FeatureNode() {}
+  virtual int get_size(bool need_feature);
+  virtual void to_buffer(char *buffer, bool need_feature);
+  virtual void recover_from_buffer(char *buffer);
+  virtual std::string get_feature(int idx) {
+    if (idx < (int)this->feature.size()) {
+      return this->feature[idx];
+    } else {
+      return std::string("");
+    }
+  }
+
+  virtual void set_feature(int idx, std::string str) {
+    if (idx >= (int)this->feature.size()) {
+      this->feature.resize(idx + 1);
+    }
+    this->feature[idx] = str;
+  }
+  virtual void set_feature_size(int size) { this->feature.resize(size); }
+  virtual int get_feature_size() { return this->feature.size(); }
+
+  template <typename T>
+  static std::string parse_value_to_bytes(std::vector<std::string> feat_str) {
+    T v;
+    size_t Tsize = sizeof(T) * feat_str.size();
+    char buffer[Tsize];
+    for (size_t i = 0; i < feat_str.size(); i++) {
+      std::stringstream ss(feat_str[i]);
+      ss >> v;
+      std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T));
+    }
+    return std::string(buffer, Tsize);
+  }
+
+  template <typename T>
+  static std::vector<T> parse_bytes_to_array(std::string feat_str) {
+    T v;
+    std::vector<T> out;
+    size_t start = 0;
+    const char *buffer = feat_str.data();
+    while (start < feat_str.size()) {
+      std::memcpy((char *)&v, buffer + start, sizeof(T));
+      start += sizeof(T);
+      out.push_back(v);
+    }
+    return out;
+  }
+
+ protected:
+  std::vector<std::string> feature;
+};
+}
+}
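FeatureNode keeps every feature slot as a raw string, and the two templates above pack and unpack fixed-width numeric features into a byte blob. A minimal round-trip sketch, assuming only the header above:

// Illustrative sketch only: pack two int32 feature strings and unpack them.
#include <cassert>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/table/graph/graph_node.h"

int main() {
  using paddle::distributed::FeatureNode;
  // "123" and "345" become 2 * sizeof(int32_t) = 8 bytes.
  std::string bytes =
      FeatureNode::parse_value_to_bytes<int32_t>({"123", "345"});
  std::vector<int32_t> vals =
      FeatureNode::parse_bytes_to_array<int32_t>(bytes);
  assert(vals.size() == 2 && vals[0] == 123 && vals[1] == 345);
  return 0;
}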
diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc
new file mode 100644
index 00000000000000..3a680875e3df4a
--- /dev/null
+++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h"
+#include <iostream>
+#include <unordered_map>
+namespace paddle {
+namespace distributed {
+
+void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; }
+
+std::vector<int> RandomSampler::sample_k(int k) {
+  int n = edges->size();
+  if (k > n) {
+    k = n;
+  }
+  struct timespec tn;
+  clock_gettime(CLOCK_REALTIME, &tn);
+  srand(tn.tv_nsec);
+  std::vector<int> sample_result;
+  std::unordered_map<int, int> replace_map;
+  while (k--) {
+    int rand_int = rand() % n;
+    auto iter = replace_map.find(rand_int);
+    if (iter == replace_map.end()) {
+      sample_result.push_back(rand_int);
+    } else {
+      sample_result.push_back(iter->second);
+    }
+
+    iter = replace_map.find(n - 1);
+    if (iter == replace_map.end()) {
+      replace_map[rand_int] = n - 1;
+    } else {
+      replace_map[rand_int] = iter->second;
+    }
+    --n;
+  }
+  return sample_result;
+}
+
+WeightedSampler::WeightedSampler() {
+  left = nullptr;
+  right = nullptr;
+  edges = nullptr;
+}
+
+WeightedSampler::~WeightedSampler() {
+  if (left != nullptr) {
+    delete left;
+    left = nullptr;
+  }
+  if (right != nullptr) {
+    delete right;
+    right = nullptr;
+  }
+}
+
+void WeightedSampler::build(GraphEdgeBlob *edges) {
+  if (left != nullptr) {
+    delete left;
+    left = nullptr;
+  }
+  if (right != nullptr) {
+    delete right;
+    right = nullptr;
+  }
+  return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size());
+}
+
+void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start,
+                                int end) {
+  count = 0;
+  this->edges = edges;
+  if (start + 1 == end) {
+    left = right = nullptr;
+    idx = start;
+    count = 1;
+    weight = edges->get_weight(idx);
+  } else {
+    left = new WeightedSampler();
+    right = new WeightedSampler();
+    left->build_one(edges, start, start + (end - start) / 2);
+    right->build_one(edges, start + (end - start) / 2, end);
+    weight = left->weight + right->weight;
+    count = left->count + right->count;
+  }
+}
+std::vector<int> WeightedSampler::sample_k(int k) {
+  if (k > count) {
+    k = count;
+  }
+  std::vector<int> sample_result;
+  float subtract;
+  std::unordered_map<WeightedSampler *, float> subtract_weight_map;
+  std::unordered_map<WeightedSampler *, int> subtract_count_map;
+  struct timespec tn;
+  clock_gettime(CLOCK_REALTIME, &tn);
+  srand(tn.tv_nsec);
+  while (k--) {
+    float query_weight = rand() % 100000 / 100000.0;
+    query_weight *= weight - subtract_weight_map[this];
+    sample_result.push_back(sample(query_weight, subtract_weight_map,
+                                   subtract_count_map, subtract));
+  }
+  return sample_result;
+}
+
+int WeightedSampler::sample(
+    float query_weight,
+    std::unordered_map<WeightedSampler *, float> &subtract_weight_map,
+    std::unordered_map<WeightedSampler *, int> &subtract_count_map,
+    float &subtract) {
+  if (left == nullptr) {
+    subtract_weight_map[this] = weight;
+    subtract = weight;
+    subtract_count_map[this] = 1;
+    return idx;
+  }
+  int left_count = left->count - subtract_count_map[left];
+  int right_count = right->count - subtract_count_map[right];
+  float left_subtract = subtract_weight_map[left];
+  int return_idx;
+  if (right_count == 0 ||
+      (left_count > 0 && left->weight - left_subtract >= query_weight)) {
+    return_idx = left->sample(query_weight, subtract_weight_map,
+                              subtract_count_map, subtract);
+  } else {
+    return_idx =
+        right->sample(query_weight - (left->weight - left_subtract),
+                      subtract_weight_map, subtract_count_map, subtract);
+  }
+  subtract_weight_map[this] += subtract;
+  subtract_count_map[this]++;
+  return return_idx;
+}
+}
+}
diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h
new file mode 100644
index 00000000000000..1787ab23b04316
--- /dev/null
+++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h
@@ -0,0 +1,58 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <ctime>
+#include <unordered_map>
+#include <vector>
+#include "paddle/fluid/distributed/table/graph/graph_edge.h"
+namespace paddle {
+namespace distributed {
+
+class Sampler {
+ public:
+  virtual ~Sampler() {}
+  virtual void build(GraphEdgeBlob *edges) = 0;
+  virtual std::vector<int> sample_k(int k) = 0;
+};
+
+class RandomSampler : public Sampler {
+ public:
+  virtual ~RandomSampler() {}
+  virtual void build(GraphEdgeBlob *edges);
+  virtual std::vector<int> sample_k(int k);
+  GraphEdgeBlob *edges;
+};
+
+class WeightedSampler : public Sampler {
+ public:
+  WeightedSampler();
+  virtual ~WeightedSampler();
+  WeightedSampler *left, *right;
+  float weight;
+  int count;
+  int idx;
+  GraphEdgeBlob *edges;
+  virtual void build(GraphEdgeBlob *edges);
+  virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end);
+  virtual std::vector<int> sample_k(int k);
+
+ private:
+  int sample(float query_weight,
+             std::unordered_map<WeightedSampler *, float> &subtract_weight_map,
+             std::unordered_map<WeightedSampler *, int> &subtract_count_map,
+             float &subtract);
+};
+}
+}
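WeightedSampler::build_one lays the edge weights out as a balanced binary tree in which each internal node caches the total weight and leaf count of its subtree; sample_k then draws k distinct edges by descending the tree with a random query weight and, via the two subtract maps, deducting each chosen leaf's weight and count on the way back up so it cannot be drawn again. Each draw costs O(log n). A minimal sketch of the intended call pattern (the edge ids and weights are illustrative):

// Illustrative sketch only: weighted neighbor sampling without replacement.
#include <iostream>
#include <vector>
#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h"

int main() {
  using namespace paddle::distributed;
  WeightedGraphEdgeBlob edges;
  edges.add_edge(45, 0.34f);   // neighbor id, edge weight
  edges.add_edge(145, 0.31f);
  edges.add_edge(112, 0.21f);

  WeightedSampler sampler;
  sampler.build(&edges);       // builds the weight tree over three leaves
  std::vector<int> picked = sampler.sample_k(2);  // two distinct edge indices
  for (int idx : picked) {
    std::cout << edges.get_id(idx) << " (w=" << edges.get_weight(idx) << ")\n";
  }
  return 0;
}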
diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc
index 9b276e7de5c92d..04cd1136382a4e 100644
--- a/paddle/fluid/distributed/table/sparse_geo_table.cc
+++ b/paddle/fluid/distributed/table/sparse_geo_table.cc
@@ -22,8 +22,17 @@ int32_t SparseGeoTable::pull_geo_param(const uint32_t trainer_id,
                                        std::vector<uint64_t>* ids) {
   geo_recorder->GetAndClear(trainer_id, ids);
   auto dim = _config.common().dims()[0];
+
+  std::vector<uint32_t> frequencies;
+  frequencies.resize(ids->size(), 1);
+
+  auto pull_value = PullSparseValue(ids->size(), dim);
+  pull_value.is_training_ = true;
+  pull_value.feasigns_ = ids->data();
+  pull_value.frequencies_ = frequencies.data();
+
   values->resize(ids->size() * dim);
-  CommonSparseTable::pull_sparse(values->data(), ids->data(), ids->size());
+  CommonSparseTable::pull_sparse(values->data(), pull_value);
   return 0;
 }
diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc
index dfaaa6ffc12c2b..600be954cb5966 100644
--- a/paddle/fluid/distributed/table/table.cc
+++
b/paddle/fluid/distributed/table/table.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/tensor_accessor.h" @@ -25,7 +26,7 @@ namespace paddle { namespace distributed { - +REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); REGISTER_PSCORE_CLASS(Table, SparseGeoTable); @@ -75,5 +76,6 @@ int32_t Table::initialize_accessor() { _value_accesor.reset(accessor); return 0; } + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 65c99d2bbd40d4..5bc818ff4741fd 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,8 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -46,8 +48,8 @@ class Table { return 0; } - virtual int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) = 0; + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, @@ -141,5 +143,6 @@ class TableManager { TableManager() {} ~TableManager() {} }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/table/tensor_table.h index 1a8f1a9cd9adb8..080682d131420b 100644 --- a/paddle/fluid/distributed/table/tensor_table.h +++ b/paddle/fluid/distributed/table/tensor_table.h @@ -52,8 +52,8 @@ class TensorTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -102,8 +102,8 @@ class DenseTensorTable : public TensorTable { DenseTensorTable() {} virtual ~DenseTensorTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -158,8 +158,8 @@ class GlobalStepTable : public DenseTensorTable { GlobalStepTable() {} virtual ~GlobalStepTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index adedd049023daa..b756c740ac764c 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -15,3 +15,6 @@ 
cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS s set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index fbd236012f5237..8fb3434af6e281 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -212,8 +212,8 @@ void RunBrpcPushSparse() { /*-----------------------Test Server Init----------------------------------*/ LOG(INFO) << "Run pull_sparse_param"; - auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0, - fea_keys.data(), fea_keys.size()); + auto pull_status = worker_ptr_->pull_sparse( + fea_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { fea_values.data()[idx] *= 2.0; @@ -241,7 +241,7 @@ void RunBrpcPushSparse() { push_status.wait(); auto pull_param_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_param_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { @@ -275,7 +275,7 @@ void RunBrpcPushSparse() { push_grad_status.wait(); auto pull_update_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_update_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index 22e11acf6584ee..c9f15db3f788e1 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
 */
 #include "paddle/fluid/distributed/ps.pb.h"
 #include "paddle/fluid/distributed/table/common_dense_table.h"
 #include "paddle/fluid/distributed/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/table/depends/sparse_utils.h"
 #include "paddle/fluid/distributed/table/sparse_geo_table.h"
 #include "paddle/fluid/distributed/table/table.h"
@@ -53,14 +54,18 @@ TEST(SparseGeoTable, SSUM) {
 
   // test push_sparse_param, and create params
   std::vector<uint64_t> init_keys = {0, 1, 2, 3, 4};
+  std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};
   std::vector<float> init_values;
   for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
     init_values.push_back(0.0);
   }
   table->push_sparse_param(init_keys.data(), init_values.data(),
                            init_keys.size());
+
   std::vector<float> pull_values(init_values.size());
-  table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size());
+  auto value = PullSparseValue(init_keys, init_fres, emb_dim);
+  table->pull_sparse(pull_values.data(), value);
+
   for (size_t i = 0; i < init_keys.size() * emb_dim; i++) {
     ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5);
   }
diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc
new file mode 100644
index 00000000000000..b268bb449e1461
--- /dev/null
+++ b/paddle/fluid/distributed/test/graph_node_test.cc
@@ -0,0 +1,556 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +void testSampleNodes( + std::shared_ptr& worker_ptr_) { + std::vector ids; + auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); + std::unordered_set s; + std::unordered_set s1 = {37, 59}; + pull_status.wait(); + for (auto id : ids) s.insert(id); + ASSERT_EQ(true, s.size() == s1.size()); + for (auto id : s) { + ASSERT_EQ(true, s1.find(id) != s1.end()); + } +} + +void testFeatureNodeSerializeInt() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeInt64() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeFloat32() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + std::cout << "Float " << out2[0] << " " << 123.123 << std::endl; + eps = out2[0] - 123.123; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testFeatureNodeSerializeFloat64() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + eps = out2[0] - 123.123; + std::cout << "Float64 " << out2[0] << " " << 123.123 << std::endl; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testSingleSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + auto pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 37), 4, vs); + pull_status.wait(); + + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + 
ASSERT_EQ(true, s1.find(g) != s1.end()); + } + VLOG(0) << "test single done"; + s.clear(); + s1.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 96), 4, vs); + pull_status.wait(); + s1 = {111, 48, 247}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testBatchSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + std::vector v = {37, 96}; + auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs); + pull_status.wait(); + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } + s.clear(); + s1.clear(); + s1 = {111, 48, 247}; + for (auto g : vs[1]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testGraphToBuffer(); +// std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"), +// std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"), +// std::string("59\ttreat\t45;0.34\t145;0.31\t112;0.21"), +// std::string("97\tfood\t48;1.4\t247;0.31\t111;1.21")}; + +std::string edges[] = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::string nodes[] = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], bool load_edge) { + std::ofstream ofile; + ofile.open(file_name); + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } + } + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + 
::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); +} + +void RunClient( + 
std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunBrpcPushSparse() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, 1); + prepare_file(node_file_name, 0); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + std::thread* server_thread2 = new std::thread(RunServer2); + sleep(1); + + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + auto pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector>> vs; + testSampleNodes(worker_ptr_); + sleep(5); + testSingleSampleNeighboor(worker_ptr_); + testBatchSampleNeighboor(worker_ptr_); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 10240001024), 4, vs); + pull_status.wait(); + ASSERT_EQ(0, vs[0].size()); + + std::vector nodes; + pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 37); + nodes.clear(); + pull_status = worker_ptr_->pull_graph_list(0, 0, 1, 4, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 59); + for (auto g : nodes) { + std::cout << g.get_id() << std::endl; + } + distributed::GraphPyServer server1, server2; + distributed::GraphPyClient client1, client2; + std::string ips_str = "127.0.0.1:5211;127.0.0.1:5212"; + std::vector edge_types = {std::string("user2item")}; + std::vector node_types = {std::string("user"), + std::string("item")}; + VLOG(0) << "make 2 servers"; + server1.set_up(ips_str, 127, node_types, edge_types, 0); + server2.set_up(ips_str, 127, node_types, edge_types, 1); + + server1.add_table_feat_conf("user", "a", "float32", 1); + server1.add_table_feat_conf("user", "b", "int32", 2); + server1.add_table_feat_conf("user", "c", "string", 1); + server1.add_table_feat_conf("user", "d", "string", 1); + server1.add_table_feat_conf("item", "a", "float32", 1); + + server2.add_table_feat_conf("user", "a", "float32", 1); + server2.add_table_feat_conf("user", "b", "int32", 2); + server2.add_table_feat_conf("user", "c", "string", 1); + server2.add_table_feat_conf("user", "d", "string", 1); + server2.add_table_feat_conf("item", "a", "float32", 1); + + client1.set_up(ips_str, 127, node_types, edge_types, 0); + + client1.add_table_feat_conf("user", "a", "float32", 1); + 
client1.add_table_feat_conf("user", "b", "int32", 2); + client1.add_table_feat_conf("user", "c", "string", 1); + client1.add_table_feat_conf("user", "d", "string", 1); + client1.add_table_feat_conf("item", "a", "float32", 1); + + client2.set_up(ips_str, 127, node_types, edge_types, 1); + + client2.add_table_feat_conf("user", "a", "float32", 1); + client2.add_table_feat_conf("user", "b", "int32", 2); + client2.add_table_feat_conf("user", "c", "string", 1); + client2.add_table_feat_conf("user", "d", "string", 1); + client2.add_table_feat_conf("item", "a", "float32", 1); + + server1.start_server(false); + std::cout << "first server done" << std::endl; + server2.start_server(false); + std::cout << "second server done" << std::endl; + client1.start_client(); + std::cout << "first client done" << std::endl; + client2.start_client(); + std::cout << "first client done" << std::endl; + std::cout << "started" << std::endl; + VLOG(0) << "come to set local server"; + client1.bind_local_server(0, server1); + VLOG(0) << "first bound"; + client2.bind_local_server(1, server2); + VLOG(0) << "second bound"; + client1.load_node_file(std::string("user"), std::string(node_file_name)); + client1.load_node_file(std::string("item"), std::string(node_file_name)); + client1.load_edge_file(std::string("user2item"), std::string(edge_file_name), + 0); + nodes.clear(); + + nodes = client1.pull_graph_list(std::string("user"), 0, 1, 4, 1); + + ASSERT_EQ(nodes[0].get_id(), 59); + nodes.clear(); + + // Test Pull by step + + std::unordered_set count_item_nodes; + // pull by step 2 + for (int test_step = 1; test_step < 4; test_step++) { + count_item_nodes.clear(); + std::cout << "check pull graph list by step " << test_step << std::endl; + for (int server_id = 0; server_id < 2; server_id++) { + for (int start_step = 0; start_step < test_step; start_step++) { + nodes = client1.pull_graph_list(std::string("item"), server_id, + start_step, 12, test_step); + for (auto g : nodes) { + count_item_nodes.insert(g.get_id()); + } + nodes.clear(); + } + } + ASSERT_EQ(count_item_nodes.size(), 12); + } + + vs = client1.batch_sample_neighboors(std::string("user2item"), + std::vector(1, 96), 4); + ASSERT_EQ(vs[0].size(), 3); + std::vector node_ids; + node_ids.push_back(96); + node_ids.push_back(37); + vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4); + + ASSERT_EQ(vs.size(), 2); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + ASSERT_EQ(nodes_ids.size(), 2); + ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || + (nodes_ids[0] == 37 && nodes_ids[1] == 59)); + + // Test get node feat + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + std::vector feature_names; + feature_names.push_back(std::string("c")); + feature_names.push_back(std::string("d")); + auto node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0]; + VLOG(0) << "get_node_feat: " << node_feat[0][1]; + VLOG(0) << "get_node_feat: " << node_feat[1][0]; + VLOG(0) << "get_node_feat: " << node_feat[1][1]; + + // Test string + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + // std::vector feature_names; + feature_names.clear(); + feature_names.push_back(std::string("a")); + feature_names.push_back(std::string("b")); + node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + 
ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[0][1].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][1].size(); + + std::remove(edge_file_name); + std::remove(node_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); + testFeatureNodeSerializeInt(); + testFeatureNodeSerializeInt64(); + testFeatureNodeSerializeFloat32(); + testFeatureNodeSerializeFloat64(); + testGraphToBuffer(); + client1.stop_server(); +} + +void testGraphToBuffer() { + ::paddle::distributed::GraphNode s, s1; + s.set_feature_size(1); + s.set_feature(0, std::string("hhhh")); + s.set_id(65); + int size = s.get_size(true); + char str[size]; + s.to_buffer(str, true); + s1.recover_from_buffer(str); + ASSERT_EQ(s.get_id(), s1.get_id()); + VLOG(0) << s.get_feature(0); + VLOG(0) << s1.get_feature(0); +} + +TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); } diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 6db95c5fac211b..26bede392d6fad 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -55,9 +55,14 @@ TEST(CommonSparseTable, SGD) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // for check std::vector total_gradients; @@ -100,7 +105,8 @@ TEST(CommonSparseTable, SGD) { std::vector pull_values; pull_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + table->pull_sparse(init_values.data(), value); + for (size_t i = 0; i < init_values.size(); ++i) { auto update_val = init_values[i] - 1.0 * total_gradients[i]; ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5); @@ -148,9 +154,13 @@ TEST(CommonSparseTable, Adam) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // push gradient std::vector> trainer_keys; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 34c87b8388975a..5636e3ed1b63f9 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); -#else - LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and " - "only effective when running with CUDA GPU."; #endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); @@ -265,12 
+262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (FLAGS_use_mkldnn) { AppendPass(pass_name); } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; + VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; } #else PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true, @@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { pass->Set("use_gpu", new bool((use_device == p::kCUDA))); if (use_device != p::kCUDA) { - LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + VLOG(1) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "mkldnn_placement_pass") { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 05c54a90f7eb02..9ced4221e1dd6c 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -205,7 +205,7 @@ class DeviceWorker { Scope* root_scope_ = nullptr; Scope* thread_scope_; paddle::platform::Place place_; - int64_t batch_num_; + int64_t batch_num_ = 0; FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100644 new mode 100755 index b36793507f54bf..6363eedc80a20f --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,9 +29,18 @@ message RecomputeConfig { } message ShardingConfig { - optional float fuse_broadcast_MB = 1 [ default = 32.0 ]; - optional bool hybrid_dp = 2 [ default = false ]; - optional int32 sharding_group_size = 3 [ default = 8 ]; + optional string sharding_segment_strategy = 1 + [ default = 'segment_broadcast_MB' ]; + optional float segment_broadcast_MB = 2 [ default = 32.0 ]; + repeated string segment_anchors = 3; + optional int32 sharding_degree = 4 [ default = 8 ]; + optional int32 mp_degree = 5 [ default = 1 ]; + optional int32 dp_degree = 6 [ default = 1 ]; + optional bool hybrid_dp = 7 [ default = false ]; + optional int32 gradient_merge_acc_step = 8 [ default = 1 ]; + optional bool optimize_offload = 9 [ default = false ]; + optional bool pp_allreduce_in_optimize = 10 [ 
default = false ]; + optional int32 pp_degree = 11 [ default = 1 ]; } message AMPConfig { @@ -152,6 +161,7 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; + optional bool find_unused_parameters = 28 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1facc..ce0a905afc6285 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -42,5 +42,5 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) if(WITH_ASCEND) - cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph) + cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) endif(WITH_ASCEND) diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index da79fccb8ca69f..baa2fd126a4b77 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -37,25 +37,50 @@ limitations under the License. */ namespace paddle { namespace framework { -// typedef std::vector AscendGraphDesc; typedef ge::Graph AscendGraphDesc; +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = ge::AscendString; +#else +using AscendString = std::string; +#endif + class AscendInstance { public: virtual ~AscendInstance() {} AscendInstance() {} - std::map GetDefaultInitSessionOptions() { - std::map init_options; - init_options["a"] = "b"; - init_options["ge.trainFlag"] = "1"; + std::map _GetDefaultInitOptions() { + std::map init_options; + init_options["ge.exec.deviceId"] = "0"; + init_options["ge.graphRunMode"] = "1"; + return init_options; + } + + std::map _GetDefaultInitSessionOptions() { + std::map init_options; + // init_options["a"] = "b"; + // init_options["ge.trainFlag"] = "1"; return init_options; } - // add other parameters here to init + ge::Status InitGEForUT() { + return ge::GEInitialize(_GetDefaultInitOptions()); + } + void InitGlobalResouces() { - session_.reset(new ge::Session(GetDefaultInitSessionOptions())); - VLOG(1) << "InitGlobalResouces Done"; + LOG(INFO) << "Begin ascend InitGlobalResouces"; + session_.reset(new ge::Session(_GetDefaultInitSessionOptions())); + if (session_ == nullptr) { + PADDLE_THROW(platform::errors::Fatal("new session error: nullptr")); + } + LOG(INFO) << "End ascend InitGlobalResouces"; + } + + void DestroyGlobalResouces() { + LOG(INFO) << "Begin ascend DestroyGlobalResouces"; + session_ = nullptr; + LOG(INFO) << "End ascend DestroyGlobalResouces"; } static std::shared_ptr GetInstance() { @@ -178,6 +203,6 @@ class AscendInstance { private: static std::shared_ptr ascend_instance_; }; -} // end namespace framework -} // end namespace paddle +} // namespace framework +} // namespace paddle #endif diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index d8639643f2c8a7..89dc5c7d3ea932 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include <ctime> #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -226,14 +227,32 @@ void HogwildWorker::PrintFetchVars() { // call count batch_num_++; int batch_per_print = fetch_config_.print_period(); - if (thread_id_ == 0) { - if (batch_num_ % batch_per_print == 0) { - int fetch_var_num = fetch_config_.fetch_var_names_size(); - for (int i = 0; i < fetch_var_num; ++i) { - platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), - fetch_config_.fetch_var_str_format(i)); + int fetch_var_num = fetch_config_.fetch_var_names_size(); + + if (fetch_var_num == 0) { + return; + } + + if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { + time_t curtime; + time(&curtime); + char mbstr[80]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", + std::localtime(&curtime)); + + std::stringstream ss; + ss << "time: [" << mbstr << "], "; + ss << "batch: [" << batch_num_ << "], "; + + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i), &ss); + if (i < fetch_var_num - 1) { + ss << ", "; } } + + std::cout << ss.str() << std::endl; } } diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index ada20113077c18..0e63320f2f7adb 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -141,14 +141,6 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); - // check if is in ernie or not - if (!graph->Has(kEmbEltwiseLayernormPass) || - !graph->Has(kMultiheadMatmulPass)) { - LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " - << "Ernie/Bert model. 
Just skip this pass."; - return; - } - std::unordered_set del_node_set; // Create an SkipLayerNorm op node diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index a97fc2e75aab14..5968df548dfb0f 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -71,37 +71,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id, const ProgramDesc& program, const platform::Place& place) { auto& global_block = program.Block(0); - std::map param_map; - for (auto& var : global_block.AllVars()) { - if (var->Persistable()) { - param_map[var->Name()] = 1; - } - } for (auto& var : global_block.AllVars()) { - bool is_param_grad = false; - size_t pos = 0; - // A magic suffix to indicate the merged gradient - std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED"; - if ((pos = var->Name().find(magicSuffix)) != std::string::npos) { - auto prefix_name = var->Name().substr(0, pos); - if (param_map.find(prefix_name) != param_map.end()) { - is_param_grad = true; - } - } if (var->Persistable() && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (is_param_grad && microbatch_id == 0) { - auto* ptr = minibatch_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create grad for persistable var: " << var->Name() + VLOG(5) << "Create persistable var: " << var->Name() << ", which pointer is " << ptr; - } else if (!var->Persistable() && !is_param_grad) { + } else if (!var->Persistable()) { auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(3) << "Create variable " << var->Name() << " for microbatch " + VLOG(5) << "Create variable " << var->Name() << " for microbatch " << microbatch_id << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); } diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f1..2a439a6f1ea81a 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. 
Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } + + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - VLOG(3) << "Init first node of backward"; + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -141,17 +166,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad 
variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. So // only the information of grad_pending_node can be used to find the @@ -235,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -262,15 +278,41 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " << op_type + << "'s input `" << pair.first << "`'s var `" << var->Name() + << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -292,10 +334,15 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary outputs here? ] + * + * - construct the temp output map, avoid to disrupt graph + * - replace the element in the map by temp var, because a + * var may be coresponding to several grad var in one op + */ NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -408,10 +455,28 @@ void BasicEngine::Execute() { } } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. 
+ * - use `tmp_ins_ptr`, only copy bwd_ins when the var in bwd_ins + * hold hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -428,15 +493,14 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` + accumulator->CallGradientHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + // 3. Call backward Hooks for `var_` + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); @@ -470,7 +534,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa61e..49761a8df0b6b1 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. 
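The "[ Why need temporary inputs here? ]" note above describes a copy-on-write discipline: CallGradientHooks clones the backward-input map only when some variable actually holds hooks, so hook execution can never mutate the original bwd_ins. A simplified standalone sketch of that pattern, using toy Var/NameVarMap stand-ins rather than Paddle's real types:

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <vector>

struct Var {
  // Each hook maps a variable to a (possibly new) variable.
  std::vector<std::function<std::shared_ptr<Var>(std::shared_ptr<Var>)>> hooks;
};
using NameVarMap = std::map<std::string, std::vector<std::shared_ptr<Var>>>;

std::shared_ptr<NameVarMap> CallHooks(const NameVarMap& ins) {
  std::shared_ptr<NameVarMap> tmp = nullptr;  // clone lazily
  for (const auto& pair : ins) {
    for (size_t i = 0; i < pair.second.size(); ++i) {
      auto var = pair.second[i];
      if (var->hooks.empty()) continue;
      if (!tmp) tmp = std::make_shared<NameVarMap>(ins);  // first hook: copy
      for (const auto& hook : var->hooks) var = hook(var);
      (*tmp)[pair.first][i] = var;  // rewrite only the copy
    }
  }
  return tmp;  // nullptr means "no hooks anywhere, reuse the original map"
}
```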
If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 873068a0d310dc..16f9454e9376e4 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -19,12 +19,11 @@ #include #include +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" #include "paddle/fluid/string/string_helper.h" @@ -77,7 +76,7 @@ void BKCLParallelContext::Init() { bkcl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { - // generate the unique ncclid on the root worker + // generate the unique bkclid on the root worker for (size_t i = 0; i < bkcl_ids.size(); ++i) { auto ret = bkcl_get_unique_id(&bkcl_ids[i]); PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, @@ -99,6 +98,28 @@ void BKCLParallelContext::Init() { } } +void BKCLParallelContext::InitWithRingID(int ring_id) { + std::vector bkcl_ids; + bkcl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique bkclid on the root worker + auto ret = bkcl_get_unique_id(&bkcl_ids[0]); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "BKCL get unique id failed [%d]", ret)); + } + BcastBKCLId(bkcl_ids, 0); + + int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id + << " ring id: " << ring_id; + // it will assign bkcl_comm in XPUDeviceContext within ring_id + platform::BKCLCommContext::Instance().CreateBKCLComm( + &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); +} + void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -146,8 +167,6 @@ void BKCLParallelContext::WaitCompute(int ring_id) { platform::errors::OutOfRange("Ring id expected < nrings," "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - // TODO(wangxi16): [Performance optimize] Maybe need to put Wait and - // bkcl_allreduce to comm thread, for bkcl_allreduce is blocking now. 
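WaitCompute on the BKCL side falls back to a host-side Wait() because bkcl_allreduce is blocking. The NCCL side of this patch keeps the ordering on-device instead: it records an event on the compute stream and makes the comm stream wait on it (see compute_events_ and comm_events_ in nccl_context.cc below). A minimal sketch of that event-based ordering, assuming plain CUDA runtime calls and omitting error handling:

```cpp
#include <cuda_runtime.h>

// Comm work submitted after this call will not start until everything
// already queued on the compute stream has finished; the host never blocks.
void CommWaitCompute(cudaStream_t compute_stream, cudaStream_t comm_stream,
                     cudaEvent_t event) {
  cudaEventRecord(event, compute_stream);
  cudaStreamWaitEvent(comm_stream, event, 0);
}
```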
auto compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); @@ -167,6 +186,12 @@ void BKCLParallelContext::WaitComm(int ring_id) { comm_dev_ctx->Wait(); } +void BKCLParallelContext::SynchronizeCompute() { + auto compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + } // namespace imperative } // namespace paddle #endif diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index d7d917f20082ac..652b7689666c6c 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -45,6 +47,8 @@ class BKCLParallelContext : public ParallelContext { void WaitCompute(int ring_id) override; void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; }; } // namespace imperative diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b9df88b1f1eeaa..df5ff750c9902f 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -384,8 +384,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been calculated done, the inner_var_ + * should be added to the var_. */ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; @@ -396,7 +396,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -427,10 +427,65 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasHook()) { + VLOG(3) << "Call " << var_->GetHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetHooks().size(); + for (const auto& hook_pair : var_->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void 
GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); + if (var_->HasMutableHook()) { + for (const auto& hook : var_->GetMutableHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(var_); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7dae6..6411dce4405c11 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. + if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } @@ -93,42 +90,38 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. 
Gradient accumulation across batches + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batches ] + * | + * [ Gradient reduce / allreduce hooks ] + + * Because we currently intend to handle these two kinds of gradient + * accumulation in one GradientAccumulator, we must distinguish between + * the two types of hooks. + + * And the InplaceVariableWrapperHook cannot be registered directly by + * users; it is currently only used to support the reduce strategy of + * parallel multi-card training. + */ + + void CallGradientHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; @@ -137,7 +130,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c7bd..4d59298aed51f1 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,100 +18,67 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, +/** [ Const VariableWrapper Hook: Pre hook functor of OpBase ] + * + * @brief This hook functor is executed before the grad OpBase is executed, * taking the input of the current grad OpBase as input, and * executing python hooks (user-defined) or C++ hooks (developer-defined) * to achieve the purpose of custom operations on the interior VarBase * gradient. * - * @note OpBasePreHook will not change the input gradient VarBase. + * @note This hook functor will not change the input gradient VarBase. * * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * 1. If we set an OpBase post hook, then when the op finishes executing, + * its output gradient may not be in its final state, because it may + * still need other ops' gradient outputs accumulated into it. But + * before an op can be executed, its input gradients must already have + * been accumulated to their final values. + * 2. We don't want the hook to change its input Tensor value, so we + * cannot simply call all hooks in the GradAccumulator. * * @note [Why only can be used for interior VarBase?] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, this hook functor cannot be used. For that case, + * we instead deal with it via the inplace hook method. 
*/ -class OpBasePreHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; +class VariableWrapperHook { public: + virtual ~VariableWrapperHook() = default; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current +/** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ] + * + * @brief This hook functor is the Hook that operates on the current * gradient after the GradientAccumulator has accumulated the gradient. * Leaf GradVarBase has no next OpBase, if we want to register hook * for it, we also need to wait until the leaf GradVarBase accumulation * is completed, so we can add post hook to GradientAccumulator. * - * @note GradAccumulatorPostHook will change the grad VarBase value. + * @note This hook functor will change the grad VarBase value. * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. + * @note Only a leaf VarBase is allowed to hold and call this hook functor. */ -class GradAccumulatorPostHook { public: - virtual ~GradAccumulatorPostHook() = default; + virtual void operator()(VariableWrapper* var) = 0; }; -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class InplaceVariableWrapperHook { public: + virtual ~InplaceVariableWrapperHook() = default; +class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( - std::function fn) + explicit LambdaInplaceVariableWrapperHook( + std::function&& fn) : fn_(std::move(fn)) {} void operator()(VariableWrapper* var) override { fn_(var); } @@ -120,114 +87,5 @@ class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { std::function fn_; }; -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. 
- * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. - */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. 
- std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); -}; - } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9dbf..f87db415768a18 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -220,6 +221,26 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: now only used for GradVarBase */ + bool HasHook() const { return var_->HasHook(); } + + int64_t AddHook(std::shared_ptr&& hook) { + return var_->AddHook( + std::forward>(hook)); + } + + bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); } + + const std::map>& GetHooks() + const { + return var_->GetHooks(); + } + + void AddMutableHook(std::shared_ptr&& hook) { + var_->AddMutableHook( + std::forward>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index eb0135d15e0743..b91fc460781c79 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -79,6 +79,30 @@ void NCCLParallelContext::Init() { } } +void NCCLParallelContext::InitWithRingID(int ring_id) { + std::vector nccl_ids; + nccl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } + BcastNCCLId(nccl_ids, 0); + + int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id + << " ring id: " << ring_id; + // it will assign nccl_comm in CUDADeviceContext within ring_id + platform::NCCLCommContext::Instance().CreateNCCLComm( + &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); + + compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); +} + void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -149,6 +173,12 @@ void NCCLParallelContext::WaitComm(int ring_id) { #endif } +void NCCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + #endif } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 51e5743aebdc3d..bcaeb811b108c5 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -53,6 +53,8 @@ class NCCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -63,6 +65,8 @@ class NCCLParallelContext 
: public ParallelContext { void WaitComm(int ring_id) override; + void SynchronizeCompute() override; + private: // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] std::vector> compute_events_; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cfd92..0164ff9313cdfe 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index ef0a9604092151..f537a316014d60 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -50,6 +50,8 @@ class ParallelContext { virtual void Init() = 0; + virtual void InitWithRingID(int ring_id) = 0; + virtual void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; @@ -64,6 +66,9 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; + // synchorize compute stream + virtual void SynchronizeCompute() = 0; + inline int GetNRings() const { return strategy_.nrings_; } inline int64_t GetNRanks() const { return strategy_.nranks_; } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835ab1..3da3a05ed1071c 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35cabfc..5422b7ce9c8552 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,13 +310,17 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->GradVarBase()->AddMutableHook( + std::make_shared([=]( + VariableWrapper *grad) { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } + + // for checking var is ready once + vars_marked_ready_.resize(vars_.size(), false); + + // Initialize local used vars + local_used_vars_.resize(vars_.size(), 0); } void Reducer::InitializeDenseGroups( @@ -325,7 +329,7 @@ void Reducer::InitializeDenseGroups( for (size_t index = 0; index < variable_indices_.size(); ++index) { const auto variable_index = variable_indices_[index]; const auto &var = vars_[variable_index]; - const auto var_name = var->Name(); + const auto &var_name = var->Name(); PADDLE_ENFORCE_EQ(is_sparse_gradient_[variable_index], false, platform::errors::PreconditionNotMet( "Tensor %s's GRAD must be LoDTensor, but received " @@ -336,7 +340,7 @@ void Reducer::InitializeDenseGroups( PADDLE_ENFORCE_EQ(lod_tensor->IsInitialized(), true, 
platform::errors::PreconditionNotMet( "Tensor %s is not initialized.", var_name)); - auto size = lod_tensor->numel(); + const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( size, 0, platform::errors::PreconditionNotMet( "The number of tensor %s's elements is 0.", var_name)); @@ -348,8 +352,8 @@ void Reducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(framework::Tensor()); // check the dtype and place, it must be same. - auto dtype = var->DataType(); - auto place = var->Place(); + const auto &dtype = var->DataType(); + const auto &place = var->Place(); if (index > 0) { PADDLE_ENFORCE_EQ( dtype, p_group->dtype_, @@ -419,8 +423,7 @@ void Reducer::InitializeGroups( group.variable_indices_ = std::move(variable_indices_); groups_.emplace_back(std::move(group)); // Debug Message For Reducer - VLOG(3) << "The Group[" << group_index << "]:"; - VLOG(3) << groups_.back(); + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); } } @@ -463,34 +466,38 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { // and allreudce sequence counter(next_group_) will be cleaned up again. void Reducer::PrepareForBackward( const std::vector> &outputs) { - VLOG(3) << "start reseting count.."; + VLOG(3) << "after forward, then reset count for backward."; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); group.sparse_contents_ = nullptr; }); + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + PADDLE_ENFORCE_EQ( - all_group_ready_, false, + groups_need_finalize_, false, platform::errors::PreconditionNotMet( - "Please note that all forward outputs derived from the module " + "A serious error has occurred here. There may be several reasons: " + "1) Please note that all forward outputs derived from the module " "parameters must participate in the calculation of losses and " "subsequent gradient calculations. If not, the wrapper will hang, " "waiting for autograd to generate gradients for these parameters. " "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph.")); + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); // The first var to trigger the unused parameter has_marked_unused_vars_ = false; + unused_vars_.clear(); + if (!find_unused_vars_) { return; } - // TODO(shenliang03) "find_unused_vars" interface will be exposed in the - // future to handle control flow to process unused parameters - find_unused_vars_ = false; - - unused_vars_.clear(); node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -553,6 +560,23 @@ void Reducer::PrepareForBackward( << "] is not used"; } } + + if (unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } else if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. 
If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } // Add hook function to each leaf node. When the gradient of a leaf node is @@ -565,67 +589,133 @@ void Reducer::PrepareForBackward( // concat + allreduce + split is emitted in turn according to next_group_. // 3, FinalizeBackward: after the end, synchronize each stream. void Reducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; - if (!has_marked_unused_vars_) { - has_marked_unused_vars_ = true; - for (auto unused_index : unused_vars_) { - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[unused_index]); - rebuild_var_indices_.push_back(unused_index); - } - MarkVarReady(unused_index, false); - } - } + local_used_vars_[var_index] = 1; + + // rebuild group when find_unused_vars_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } + + if (!has_marked_unused_vars_ && find_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto &unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); } void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { - all_group_ready_ = true; + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; - auto group_index = var_locator.group_index; + const auto group_index = var_locator.group_index; auto &group = groups_[group_index]; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "There may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, vars_[var_index]->GradVarBase()->Name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
        "again at a later time (e.g. after the forward function, "
        "the loss calculation uses the unused "
        "parameters of the forward and triggers backward), "
        "their gradients will be wrong.";
+
+    PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true,
+                      platform::errors::PreconditionNotMet(error_info));
+  } else {
+    vars_marked_ready_[var_index] = true;
+  }
+
   if (!group.is_sparse_) {
     // process dense group
-    auto inside_group_index = var_locator.inside_group_index;
-    auto length = group.length_[inside_group_index];
+    const auto inside_group_index = var_locator.inside_group_index;
+    const auto length = group.length_[inside_group_index];
     auto &group_tensor = group.dense_tensors_[inside_group_index];
+
     if (is_used_var) {
-      auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar();
-      auto tensor =
-          var_warpper->MutableVar()->GetMutable();
+      auto var_base = vars_[var_index]->GradVarBase();
+      auto tensor =
+          var_base->MutableVar()->GetMutable<framework::LoDTensor>();
       group_tensor.ShareDataWith(*tensor).Resize(
           {static_cast<int64_t>(length)});
     } else {
+      // TODO(shenliang03): maybe save the memory
+      // by avoiding tensor construction
       if (!group_tensor.IsInitialized()) {
         group_tensor.Resize({static_cast<int64_t>(length)});
         group_tensor.mutable_data(place_, group.dtype_);
+      }
+
 #ifdef PADDLE_WITH_XPU_BKCL
-      if (platform::is_xpu_place(group_tensor.place())) {
-        // TODO(liuyuhui) support XPU set constant
-        VLOG(3) << "XPU doesn't support set_constant";
-      }
+      if (platform::is_xpu_place(group_tensor.place())) {
+        // TODO(liuyuhui) support XPU set constant
+        VLOG(3) << "XPU doesn't support set_constant";
+      }
 #else
-      auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
+      auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
+      if (HasGrad(var_index)) {
+        auto var_base = vars_[var_index]->GradVarBase();
+        auto tensor =
+            var_base->MutableVar()->GetMutable<framework::LoDTensor>();
+        TensorCopy(*tensor, place_, *dev_ctx, &group_tensor);
+        group_tensor.Resize({static_cast<int64_t>(length)});
+      } else {
+        group_tensor.Resize({static_cast<int64_t>(length)});
         operators::math::set_constant(*dev_ctx, &group_tensor, 0.0);
-#endif
       }
+#endif
     }
   } else {
     // process sparse group
-    if (is_used_var) {
-      auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar();
-      group.sparse_contents_ = var_warpper->MutableVar();
-    } else {
-      group.sparse_contents_ = nullptr;
-    }
+    PADDLE_ENFORCE_EQ(HasGrad(var_index), true,
+                      platform::errors::PreconditionNotMet(
+                          "The sparse parameter[%d][%s] must have a gradient",
+                          var_index, vars_[var_index]->Name()));
+    auto var_base = vars_[var_index]->GradVarBase();
+    // need to check tensor type
+    PADDLE_ENFORCE_EQ(
+        var_base->Var().IsType<framework::SelectedRows>(), true,
+        platform::errors::PreconditionNotMet(
+            "The sparse parameter[%d][%s] must have a SelectedRows gradient. "
+            "Before the forward pass, the parameter type is inferred to be "
+            "SelectedRows, but after the backward pass, its actual type "
+            "becomes LoDTensor. This is currently not supported by "
+            "DataParallel. "
+            "For example, if a sparse embedding is used, and the weight of "
+            "the embedding is shared with subsequent dense parameters, then "
+            "the parameter gradient of the embedding will be converted to "
+            "dense.",
+            var_index, vars_[var_index]->Name()));
+
+    group.sparse_contents_ = var_base->MutableVar();
   }

   if (--group.pending_ == 0) {
@@ -641,6 +731,14 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
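The two functions around this hunk implement a countdown handshake: every gradient hook decrements its group's pending_ counter, and only the hook that zeroes the counter of the lowest not-yet-flushed group may launch allreduce, in group order. The following is a minimal, self-contained sketch of that pattern (MiniGroup, MiniReducer, and on_grad_ready are illustrative stand-ins, not Paddle's types):

#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

struct MiniGroup {
  std::vector<int> var_indices;
  std::size_t pending = 0;
};

class MiniReducer {
 public:
  explicit MiniReducer(std::vector<MiniGroup> groups)
      : groups_(std::move(groups)) {
    for (auto &g : groups_) g.pending = g.var_indices.size();
  }

  // Called from a gradient hook (cf. MarkVarReady): the last variable of a
  // group tries to flush it.
  void on_grad_ready(std::size_t group_index) {
    if (--groups_[group_index].pending == 0) mark_group_ready(group_index);
  }

 private:
  // cf. MarkGroupReady: groups must be flushed strictly in order, so a group
  // that finishes early waits until every earlier group is done.
  void mark_group_ready(std::size_t group_index) {
    if (group_index > next_group_) return;  // an earlier group still pending
    for (; next_group_ < groups_.size() && groups_[next_group_].pending == 0;
         ++next_group_) {
      std::printf("allreduce group %zu\n", next_group_);
    }
  }

  std::vector<MiniGroup> groups_;
  std::size_t next_group_ = 0;
};

int main() {
  MiniReducer r({{{0, 1}}, {{2}}});
  r.on_grad_ready(1);  // group 1 finishes first, but must wait for group 0
  r.on_grad_ready(0);
  r.on_grad_ready(0);  // last var of group 0 flushes groups 0 and 1 in order
  return 0;
}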
// TODO(liuyuhui): If BKCL supports non-blocking communication, this should be
// fixed in the same way as multi-GPU card training.
 void Reducer::MarkGroupReady(size_t group_index) {
+  PADDLE_ENFORCE_GE(
+      group_index, next_group_,
+      platform::errors::PreconditionNotMet(
+          "The index of the incoming group must be greater "
+          "than or equal to the previously synchronized group index, "
+          "expected it to be greater than or equal to %d, but got %d.",
+          next_group_, group_index));
+
   if (group_index > next_group_) {
     VLOG(3) << "It will adjust the order of groups in the next batch automatically";
     return;
   }

   for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0;
        ++next_group_) {
     auto &group = groups_[next_group_];
-    int run_order = next_group_ % nrings_;
+    const int run_order = next_group_ % nrings_;

     // For CUDA or XPU, compute_stream --> comm_stream.
     // For CPU, do nothing.
@@ -668,7 +766,7 @@
       comm_pool_->enqueue([&] {
         auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
         platform::SetXPUDeviceId(dev_id);
-        FusedAllReduceSchedule(run_order, group);
+        FusedAllReduceSchedule(run_order, group, next_group_);
         {
           std::lock_guard<std::mutex> lock(mutex_);
           comm_op_count_ -= 1;  // lock
@@ -676,7 +774,7 @@
         }
       });
 #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
-    FusedAllReduceSchedule(run_order, group);
+    FusedAllReduceSchedule(run_order, group, next_group_);
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "Not compiled with BKCL or NCCL."));
@@ -684,24 +782,23 @@
   }
 }

-void Reducer::FusedAllReduceSchedule(int run_order, Group &group) {
+void Reducer::FusedAllReduceSchedule(const int run_order, Group &group,
+                                     const int curr_group_index) {
+  // The overall timeline: concat > div_nranks > allreduce > split
+  // dev_context is used to select the stream to run on
+  const auto &dev_context = *parallel_ctx_->GetDeviceContext(run_order);
   if (group.is_sparse_) {
-    if (group.sparse_contents_ != nullptr) {
-      VLOG(3) << "sparse group [" << next_group_ << "] start allreduce in ring["
-              << run_order << "]";
-      group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_);
-      parallel_ctx_->AllReduceByStream(
-          *group.sparse_contents_, group.sparse_contents_, run_order, false);
-    } else {
-      VLOG(3) << "The sparse group[" << next_group_
-              << "] has no var to allreduce";
-    }
+    VLOG(3) << "sparse group [" << curr_group_index
+            << "] start allreduce in ring[" << run_order << "]";
+    group.DivNRanks(dev_context, nranks_);
+    parallel_ctx_->AllReduceByStream(*group.sparse_contents_,
+                                     group.sparse_contents_, run_order, false);
   } else {
-    VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring["
-            << run_order << "]";
+    VLOG(3) << "dense group [" << curr_group_index
+            << "] start allreduce in ring[" << run_order << "]";
     // Select the communication stream to concat tensors
     // group.dense_tensors ---> group.dense_contents_
-    group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order));
+    group.ConcatTensors(dev_context);

// NOTE(liuyuhui): ConcatTensors uses the communication stream, but BKCL only
// supports the default stream for communicating, so there exist some problems in
@@ -713,15 +810,15 @@
       parallel_ctx_->WaitComm(run_order);
     }
 #endif
-    group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_);
+    group.DivNRanks(dev_context, nranks_);

     // Start allreduce
     parallel_ctx_->AllReduceByStream(
        group.dense_contents_, &(group.dense_contents_), run_order, false);

-    // Select common commstream to split tensors
+    // Select communication stream to split tensors
     // group.dense_contents_ ---> group.dense_tensors
-    group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order));
+    group.SplitTensors(dev_context);
   }
 }

@@ -747,14 +844,98 @@ std::vector<std::vector<size_t>> Reducer::RebuildGruops() {
   return rebuild_group_indices;
 }

+void Reducer::ProcessUnusedDenseVars() {
+  // The calculation stream must be used here to
+  // avoid conflicts with communication.
+  VLOG(3) << "Local used vars : "
+          << string::join_strings(local_used_vars_, ',');
+  const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
+  // an H2D copy is needed so that local_used_vars_ can be allreduced
+  auto *global_used_tensor =
+      global_used_vars_.GetMutable<framework::LoDTensor>();
+  framework::TensorFromVector(local_used_vars_, *dev_ctx,
+                              global_used_tensor);
+  parallel_ctx_->AllReduceByStream(global_used_vars_, &global_used_vars_, 0,
+                                   true);
+  framework::TensorToVector(*global_used_tensor, *dev_ctx,
+                            &local_used_vars_);
+
+  // sync the compute stream to get the global used var message,
+  // but this may hurt speed performance
+  parallel_ctx_->SynchronizeCompute();
+  VLOG(3) << "Global used vars : "
+          << string::join_strings(local_used_vars_, ',');
+
+  for (const auto var_index : unused_vars_) {
+    const bool global_unused = (local_used_vars_[var_index] == 0);
+
+    // globally used but locally unused: set the grad
+    VLOG(3) << "Var [" << var_index << "] [" << vars_[var_index]->Name()
+            << "] global_unused:" << global_unused
+            << " has grad: " << HasGrad(var_index);
+
+    if (!global_unused) {
+      VLOG(3) << "Start processing unused Var";
+      // 1. source var base
+      const auto &var_locator = variable_locators_[var_index];
+      const auto group_index = var_locator.group_index;
+      const auto &group = groups_[group_index];
+      const auto inside_group_index = var_locator.inside_group_index;
+      const auto &src_tensor = group.dense_tensors_[inside_group_index];
+      // sparse vars need no check here; find_unused_parameters is not
+      // supported for them
+      if (group.is_sparse_) {
+        continue;
+      }
+      // 2. destination var base
+      auto dest_var_base = vars_[var_index];
+      auto *dest_tensor =
+          dest_var_base->MutableVar()->GetMutable<framework::LoDTensor>();
+      const auto &dest_dims = dest_tensor->dims();

+      // 3. create grad var base or get grad var base
+      auto grad_var_base_tmp = dest_var_base->MutableGradVarBase();
+
+      // 4. set grad tensor
+      auto *dest_grad_tensor =
+          grad_var_base_tmp->MutableVar()->GetMutable<framework::LoDTensor>();
+      const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
+      TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor);
+      dest_grad_tensor->Resize(dest_dims);
+    }
+  }
+}
+
+bool Reducer::HasGrad(size_t var_index) {
+  const auto grad_var = vars_[var_index]->GradVarBase();
+  if (!grad_var || !grad_var->Var().IsInitialized()) {
+    return false;
+  }
+
+  const auto &var = grad_var->Var();
+  if (var.IsType<framework::LoDTensor>()) {
+    if (var.Get<framework::LoDTensor>().IsInitialized()) {
+      return true;
+    }
+  } else if (var.IsType<framework::SelectedRows>()) {
+    if (var.Get<framework::SelectedRows>().value().IsInitialized()) {
+      return true;
+    }
+  } else {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Only LoDTensor and SelectedRows are supported for the gradient var"));
+  }
+  return false;
+}
+
 void Reducer::FinalizeBackward() {
-  all_group_ready_ = false;
+  groups_need_finalize_ = false;
 #ifdef PADDLE_WITH_XPU_BKCL
   {
     std::unique_lock<std::mutex> lock(mutex_);
     cv_.wait(lock, [&] { return comm_op_count_ == 0; });
   }
 #endif
+
  // Must prevent compute_stream_ starting until all comm streams have finished
   for (int i = 0; i < nrings_; ++i) {
     parallel_ctx_->WaitComm(i);
@@ -767,7 +948,18 @@
     InitializeGroups(group_indices_);
   }

-  VLOG(3) << "In the batch, Reducer is finished...";
+  if (find_unused_vars_) {
+// TODO(liuyuhui): support TensorCopy/TensorFromVector/TensorToVector on XPU
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    ProcessUnusedDenseVars();
+#endif
+    // Initialize local used vars
+    local_used_vars_.clear();
+    local_used_vars_.resize(vars_.size(), 0);
+    VLOG(3) << "ProcessUnusedDenseVars is finished.";
+  }
+
+  VLOG(3) << "In the batch, Reducer is finished.";
 }

 // According to the size of each parameter, it is allocated to different groups.
diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index b2680d0dea71aa..0d613dbea89633 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -27,6 +27,7 @@
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
@@ -153,13 +154,20 @@ class Reducer {
   void MarkGroupReady(size_t group_index);

-  void FusedAllReduceSchedule(int run_order, Group& group);  // NOLINT
+  void FusedAllReduceSchedule(const int run_order, Group& group,  // NOLINT
+                              const int curr_group_index);

   void FinalizeBackward();

   std::vector<std::vector<size_t>> RebuildGruops();

-  inline bool NeedRebuildGroup() { return !has_rebuilt_group_; }
+  inline bool NeedRebuildGroup() {
+    return !has_rebuilt_group_ && !find_unused_vars_;
+  }
+
+  void ProcessUnusedDenseVars();
+
+  bool HasGrad(size_t var_index);

  private:
   std::vector<std::shared_ptr<imperative::VarBase>> vars_;
@@ -188,7 +196,7 @@ class Reducer {
   std::vector<size_t> unused_vars_;
   bool has_marked_unused_vars_{false};
   bool find_unused_vars_{false};
-  bool all_group_ready_{false};
+  bool groups_need_finalize_{false};
 #ifdef PADDLE_WITH_XPU_BKCL
  // comm_pool_ is used for scheduling allreduce in multi-Kunlun-card training.
   std::unique_ptr<::ThreadPool> comm_pool_{nullptr};
@@ -196,6 +204,19 @@ class Reducer {
   std::mutex mutex_;
   std::condition_variable cv_;
 #endif
+
+  // just for checking the hooks: each parameter can only trigger its hook once
+  std::vector<bool> vars_marked_ready_;
+
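ProcessUnusedDenseVars above boils down to a small handshake: every rank records a 0/1 usage vector, an allreduce-sum turns it into a global usage mask, and a rank that did not produce a gradient locally still has to accept the reduced one whenever any other rank did. A self-contained sketch of that logic, with plain vectors standing in for the tensors and for AllReduceByStream (all names here are illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for the allreduce over the usage mask; the real implementation
// goes through the communication stream (cf. AllReduceByStream).
std::vector<int> allreduce_sum(const std::vector<std::vector<int>> &per_rank) {
  std::vector<int> global(per_rank[0].size(), 0);
  for (const auto &local : per_rank)
    for (std::size_t i = 0; i < local.size(); ++i) global[i] += local[i];
  return global;
}

int main() {
  // Two ranks, three parameters; parameter 2 is only used on rank 1.
  const std::vector<std::vector<int>> local_used = {{1, 1, 0}, {1, 1, 1}};
  const std::vector<int> global_used = allreduce_sum(local_used);

  for (std::size_t i = 0; i < global_used.size(); ++i) {
    const bool global_unused = (global_used[i] == 0);
    if (!global_unused && local_used[0][i] == 0) {
      // Rank 0 took part in the allreduce with a zero-filled slot, so it must
      // now copy the reduced buffer into the parameter's gradient
      // (cf. the TensorCopy in ProcessUnusedDenseVars).
      std::printf("rank 0: fill grad of param %zu from reduced buffer\n", i);
    }
  }
  return 0;
}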
+  // The following variables help control the flow.
+  // local_used_vars_ uses 0/1 to indicate whether a
+  // var is used in the iteration. After the end of the
+  // iteration, global_used_vars_ is obtained by a global
+  // synchronization, and it decides whether the local
+  // gradient is updated, according to global_used_vars_.
+  std::vector<int> local_used_vars_;
+  // global_used_vars_ is used in the comm stream to avoid waiting
+  framework::Variable global_used_vars_;
 };

 std::vector<std::vector<size_t>> AssignGroupBySize(
diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc
index 7bf5f876681bab..8c907b9890652a 100644
--- a/paddle/fluid/imperative/tests/test_hooks.cc
+++ b/paddle/fluid/imperative/tests/test_hooks.cc
@@ -74,16 +74,15 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) {
   mul_attr_map["use_mkldnn"] = false;

   // add GradAccumulatorPostHook
-  auto x_var_wrapper = x->SharedVar();
-  x_var_wrapper->AddGradVarLeafBackwardHook(
-      std::unique_ptr(
-          new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) {
+  x->GradVarBase()->AddMutableHook(
+      std::make_shared(
+          [=](VariableWrapper* grad) {
             auto* grad_tensor =
                grad->MutableVar()->GetMutable<framework::LoDTensor>();
             for (int i = 0; i < grad_tensor->numel(); ++i) {
              grad_tensor->mutable_data<float>(place)[i] *= 2.0;
             }
-          })));
+          }));

   // 2. forward
   tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true);
@@ -93,8 +92,10 @@
   ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL);

   // 3. backward
+  std::vector<std::shared_ptr<imperative::VarBase>> tensors{out};
+  std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
   BasicEngine engine;
-  engine.Init(out.get());
+  engine.Init(tensors, grad_tensors);
   engine.Execute();

   framework::LoDTensor x_grad;
@@ -151,17 +152,16 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() {
   memory::Copy(place, mutable_z, place, src_data.data(),
                sizeof(float) * src_data.size());

-  // add GradAccumulatorPostHook
-  auto x_var_wrapper = x->SharedVar();
-  x_var_wrapper->AddGradVarLeafBackwardHook(
-      std::unique_ptr(
-          new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) {
+  // add ReduceBackwardHook
+  x->GradVarBase()->AddMutableHook(
+      std::make_shared(
+          [=](VariableWrapper* grad) {
             auto* grad_tensor =
                grad->MutableVar()->GetMutable<framework::LoDTensor>();
             for (int i = 0; i < grad_tensor->numel(); ++i) {
              grad_tensor->mutable_data<float>(place)[i] *= 2.0;
             }
-          })));
+          }));

   // 2. forward
   var_pair x_pair = var_pair("X", vb_vector(1, x));
@@ -193,8 +193,10 @@
   ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL);

   // 3. backward
+  std::vector<std::shared_ptr<imperative::VarBase>> tensors{out};
+  std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
   BasicEngine engine;
-  engine.Init(out.get());
+  engine.Init(tensors, grad_tensors);
   engine.Execute();

   framework::LoDTensor x_grad;
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index 9e3b0ea5df6838..76de413b3e6033 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
   tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
                  gpu_place, true);
   imperative::BasicEngine engine;
-  engine.Init(reduce_sum_out.get());
+
+  std::vector<std::shared_ptr<imperative::VarBase>> tensors{reduce_sum_out};
+  std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
+  engine.Init(tensors, grad_tensors);
   engine.Execute();

   framework::LoDTensor rlt;
@@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) {
   ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
   ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);

+  std::vector<std::shared_ptr<imperative::VarBase>> tensors{vout};
+  std::vector<std::shared_ptr<imperative::VarBase>> grad_tensors{nullptr};
   imperative::BasicEngine engine;
-  engine.Init(vout.get());
+  engine.Init(tensors, grad_tensors);
   engine.Execute();

   // check the grad
diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h
index b42f25dcc88001..7d287c9829104a 100644
--- a/paddle/fluid/imperative/variable_wrapper.h
+++ b/paddle/fluid/imperative/variable_wrapper.h
@@ -27,8 +27,8 @@
 namespace paddle {
 namespace imperative {

-class InteriorVarHookPipeline;
-class LeafVarHookPipeline;
+class VariableWrapperHook;
+class InplaceVariableWrapperHook;

 class VarBase;
 class GradOpNode;
@@ -193,42 +193,6 @@
     }
   }

-  /* Hook related method: only can be call by GradVarBase */
-
-  bool HasInteriorHooks() const { return interior_hooks_ != nullptr; }
-
-  bool HasLeafHooks() const { return leaf_hooks_ != nullptr; }
-
-  void AddGradVarInteriorHook(std::unique_ptr&& hook) {
-    auto interior_hooks = GetGradVarInteriorHooksSafely();
-    interior_hooks->add_hook(std::move(hook));
-  }
-
-  void AddGradVarLeafHook(std::unique_ptr&& hook) {
-    auto leaf_hooks = GetGradVarLeafHooksSafely();
-    leaf_hooks->add_hook(std::move(hook));
-  }
-
-  void AddGradVarLeafBackwardHook(
-      std::unique_ptr&& hook) {
-    auto leaf_hooks = GetGradVarLeafHooksSafely();
-    leaf_hooks->add_backward_hook(std::move(hook));
-  }
-
-  const std::shared_ptr& GetInteriorHooks() const {
-    return interior_hooks_;
-  }
-
-  std::shared_ptr& GetInteriorHooks() {
-    return interior_hooks_;
-  }
-
-  const std::shared_ptr& GetLeafHooks() const {
-    return leaf_hooks_;
-  }
-
-  std::shared_ptr& GetLeafHooks() { return leaf_hooks_; }
-
   uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; }

   void ResetInplaceVersion() {
@@ -255,6 +219,38 @@
     return;
   }

+  /* Hook related methods */
+  bool HasHook() const { return !hooks_.empty(); }
+
+  bool HasMutableHook() const { return !mutable_hooks_.empty(); }
+
+  int64_t AddHook(std::shared_ptr<VariableWrapperHook>&& hook) {
+    hooks_.emplace(next_hook_id_, std::move(hook));
+    return next_hook_id_++;
+  }
+
+  bool RemoveHook(const int64_t& hook_id) {
+    auto remove_cnt = hooks_.erase(hook_id);
+    if (remove_cnt == 0) {
+      return false;
+    }
+    return true;
+  }
+
+  const std::map<int64_t, std::shared_ptr<VariableWrapperHook>>& GetHooks()
+      const {
+    return hooks_;
+  }
+
+  void AddMutableHook(std::shared_ptr<InplaceVariableWrapperHook>&& hook) {
+    mutable_hooks_.emplace_back(std::move(hook));
+  }
+
+  const std::vector<std::shared_ptr<InplaceVariableWrapperHook>>&
+  GetMutableHooks() const {
+    return mutable_hooks_;
+  }
+
  private:
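The hook block just introduced replaces the old interior/leaf pipelines with one id-keyed registry: AddHook stores the hook under an auto-incremented id and returns it, and RemoveHook erases by that id. A compilable miniature of the same pattern outside Paddle's classes (HookRegistry and its members are illustrative):

#include <cstdint>
#include <cstdio>
#include <functional>
#include <map>
#include <utility>

class HookRegistry {
 public:
  using Hook = std::function<void(float &)>;

  // Returns the id under which the hook is stored (cf. AddHook above).
  int64_t AddHook(Hook hook) {
    hooks_.emplace(next_hook_id_, std::move(hook));
    return next_hook_id_++;
  }

  bool RemoveHook(int64_t hook_id) { return hooks_.erase(hook_id) > 0; }

  void Run(float &grad) const {
    for (const auto &kv : hooks_) kv.second(grad);  // ordered by insertion id
  }

 private:
  int64_t next_hook_id_ = 0;
  std::map<int64_t, Hook> hooks_;
};

int main() {
  HookRegistry registry;
  const int64_t id = registry.AddHook([](float &g) { g *= 2.0f; });
  registry.AddHook([](float &g) { g += 1.0f; });

  float grad = 3.0f;
  registry.Run(grad);  // (3 * 2) + 1 = 7
  std::printf("grad = %.1f\n", grad);

  registry.RemoveHook(id);  // the doubling hook can be detached by its id
  return 0;
}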
  void SetGradVar(const std::shared_ptr& var) {
     auto shared_var = grad_var_.lock();
@@ -289,41 +285,6 @@
     }
   }

-  /* Hook related private methods */
-  std::shared_ptr GetGradVarSafely() const {
-    auto shared_grad_var = grad_var_.lock();
-    PADDLE_ENFORCE_NOT_NULL(
-        shared_grad_var,
-        platform::errors::PermissionDenied(
-            "Cannot add gradient hook on Tensor without gradient."));
-    return shared_grad_var;
-  }
-
-  std::shared_ptr& GetGradVarInteriorHooksSafely() {
-    auto shared_grad_var = GetGradVarSafely();
-    PADDLE_ENFORCE_EQ(HasGradNode(), true,
-                      platform::errors::PermissionDenied(
-                          "Only interior Tensor in backward can register "
-                          "interior gradient hook."));
-    if (shared_grad_var->interior_hooks_ == nullptr) {
-      shared_grad_var->interior_hooks_ =
-          std::make_shared();
-    }
-    return shared_grad_var->interior_hooks_;
-  }
-
-  std::shared_ptr& GetGradVarLeafHooksSafely() {
-    auto shared_grad_var = GetGradVarSafely();
-    PADDLE_ENFORCE_EQ(
-        HasGradNode(), false,
-        platform::errors::PermissionDenied(
-            "Only leaf Tensor in backward can register leaf gradient hook."));
-    if (shared_grad_var->leaf_hooks_ == nullptr) {
-      shared_grad_var->leaf_hooks_ = std::make_shared();
-    }
-    return shared_grad_var->leaf_hooks_;
-  }
-
  private:
   framework::Variable var_;
   std::string name_;
@@ -358,11 +319,14 @@
  // isn't need
   bool is_empty_{false};

-  // NOTE: only grad var can hold hooks now
-  // only interior var can hold interior hooks
-  std::shared_ptr interior_hooks_;
-  // only leaf var can hold leaf hooks
-  std::shared_ptr leaf_hooks_;
+  // NOTE(chenweihang): only grad var can hold hooks now
+  int64_t next_hook_id_{0};
+  // Hooks registered for the grad var; adding and removing are supported,
+  // keyed by an auto-incremented int64_t id
+  std::map<int64_t, std::shared_ptr<VariableWrapperHook>> hooks_;
+  // Hooks executed after the entire backward process is over,
+  // currently only used by the reducer in distributed training
+  std::vector<std::shared_ptr<InplaceVariableWrapperHook>> mutable_hooks_;
 };

 } // namespace imperative
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 9a4637306bb359..03f86cc7ba6de6 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -57,11 +57,9 @@ if(WITH_TESTING)
   if (NOT APPLE AND NOT WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared
       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR})
-    set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
   elseif(WIN32)
     inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps}
       ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR})
-    set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
   endif()
 endif()
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 21ef3b2312ff6d..4b6c746d57525a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1192,6 +1192,7 @@
 USE_TRT_CONVERTER(scale);
 USE_TRT_CONVERTER(stack);
 USE_TRT_CONVERTER(clip);
 USE_TRT_CONVERTER(gather);
+USE_TRT_CONVERTER(anchor_generator);
 USE_TRT_CONVERTER(yolo_box);
 USE_TRT_CONVERTER(roi_align);
 USE_TRT_CONVERTER(affine_channel);
diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh
index 0d9f3d2aa237ac..c265721db57752 100755
--- 
a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -x cd `dirname $0` rm -rf build/ data/ diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 3f79230094241c..3820ac5d7cc246 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + anchor_generator_op.cc yolo_box_op.cc roi_align_op.cc affine_channel_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc new file mode 100644 index 00000000000000..56aab9785c90f3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
 */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/* Anchor Generator Op */
+class AnchorGeneratorOpConverter : public OpConverter {
+ public:
+  void operator()(const paddle::framework::proto::OpDesc& op,
+                  const paddle::framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "convert a fluid anchor generator op to tensorrt plugin";
+    framework::OpDesc op_desc(op, nullptr);
+    std::string input_name = op_desc.Input("Input").front();
+    std::string anchor_name = op_desc.Output("Anchors").front();
+    std::string variance_name = op_desc.Output("Variances").front();
+
+    auto* input = engine_->GetITensor(input_name);
+    const auto input_dims = input->getDimensions();  // C, H, W
+    std::vector<std::string> output_names{anchor_name, variance_name};
+
+    const auto anchor_sizes =
+        BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("anchor_sizes"));
+    const auto aspect_ratios =
+        BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("aspect_ratios"));
+    const auto stride =
+        BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("stride"));
+    const auto variances =
+        BOOST_GET_CONST(std::vector<float>, op_desc.GetAttr("variances"));
+    const auto offset = BOOST_GET_CONST(float, op_desc.GetAttr("offset"));
+    const int num_anchors = aspect_ratios.size() * anchor_sizes.size();
+    bool is_dynamic = engine_->with_dynamic_shape();
+    const auto height = input_dims.d[1];
+    const auto width = input_dims.d[2];
+    const int box_num = width * height * num_anchors;
+    const nvinfer1::DataType data_type = nvinfer1::DataType::kFLOAT;
+
+    nvinfer1::IPluginV2* anchor_generator_plugin = nullptr;
+    if (is_dynamic) {
+      anchor_generator_plugin = new plugin::AnchorGeneratorPluginDynamic(
+          data_type, anchor_sizes, aspect_ratios, stride, variances, offset,
+          num_anchors);
+    } else {
+      anchor_generator_plugin = new plugin::AnchorGeneratorPlugin(
+          data_type, anchor_sizes, aspect_ratios, stride, variances, offset,
+          height, width, num_anchors, box_num);
+    }
+
+    std::vector<nvinfer1::ITensor*> anchor_generator_inputs{input};
+    auto* anchor_generator_layer = engine_->network()->addPluginV2(
+        anchor_generator_inputs.data(), anchor_generator_inputs.size(),
+        *anchor_generator_plugin);
+
+    RreplenishLayerAndOutput(anchor_generator_layer, "anchor_generator",
+                             output_names, test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(anchor_generator, AnchorGeneratorOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index dfadb28a6520f9..74057addecd1f9 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -251,7 +251,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
 #if IS_TRT_VERSION_GE(6000)
     plugin::ElementwisePluginDynamic* plugin =
         new plugin::ElementwisePluginDynamic(op_type_, axis);
-    layer = engine_->AddPluginV2(itensors.data(), 2, plugin);
+    layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin);
 #else
     PADDLE_THROW(platform::errors::Fatal(
         "You are running the TRT Dynamic Shape mode, need to confirm that "
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 7f8843a3f67d05..957dfe03698981 100644
--- 
a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -169,7 +169,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { plugin = new plugin::EmbEltwiseLayernormPluginDynamic( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, eps, with_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 4c9996ca02cad4..ca5b6a8b52e797 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -64,7 +64,7 @@ class GeluOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 8ce46a19d4b06e..20086465491326 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -227,7 +227,7 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index aa4e54b5845722..c10072602d7c51 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -147,7 +147,7 @@ class Pool2dOpConverter : public OpConverter { plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, strides, paddings, global_pooling); - layer = engine_->AddPluginV2(&input1, 1, plugin); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 5e881ecbbc4e2c..74d77d8be44937 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -65,7 +65,7 @@ class PReluOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( alpha_data, alpha_tensor_temp->numel(), mode); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc 
b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 2e4a4e6120d2d8..3db7709acc22d0 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -90,7 +90,7 @@ class SkipLayerNormOpConverter : public OpConverter { plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, scale_size, eps, with_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0bd2b8c9bf5eef..38521d256419d0 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -90,14 +90,14 @@ class SliceOpConverter : public OpConverter { // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), - plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), + plugin_inputs.size(), plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 5d494c2093b2a9..75b317e7bfd90e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -90,7 +90,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index 1c971fa12e27e8..d538c58879d781 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -59,7 +59,7 @@ class StackOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::StackPluginDynamic* plugin = new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddPluginV2(inputs, input_num, plugin); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); assert(layer != nullptr); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 25944a2fead6cd..b2e394d14eba23 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -65,7 +65,7 @@ class SwishOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta, with_fp16); - layer = 
engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index de2924824f09de..2358e1ef976cdb 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -378,9 +378,9 @@ class TensorRTEngine { bool with_dynamic_shape() { return with_dynamic_shape_; } #if IS_TRT_VERSION_GE(6000) - nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs, - int num_inputs, - plugin::DynamicPluginTensorRT* plugin) { + nvinfer1::IPluginV2Layer* AddDynamicPlugin( + nvinfer1::ITensor* const* inputs, int num_inputs, + plugin::DynamicPluginTensorRT* plugin) { owned_pluginv2_.emplace_back(plugin); return network()->addPluginV2(inputs, num_inputs, *plugin); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index c95912a931e0bc..b681b098c8c765 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -116,6 +116,7 @@ struct SimpleOpTypeSetTeller : public Teller { "affine_channel", "multiclass_nms", "nearest_interp", + "anchor_generator", }; }; @@ -205,7 +206,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, (desc.HasAttr("class_num") && desc.HasAttr("anchors") && desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); - return has_attrs; + if (!has_attrs) return false; } if (op_type == "affine_channel") { diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b4e948edd8a6bb..1804e6c5571d3a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + anchor_generator_op_plugin.cu yolo_box_op_plugin.cu roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu new file mode 100644 index 00000000000000..01ee86ceb48a9e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -0,0 +1,566 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+
+#include "paddle/fluid/operators/detection/anchor_generator_op.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#define PrepareParamsOnDevice()                                           \
+  constexpr int data_size = 4; /* sizeof(float) */                        \
+  cudaMalloc(&anchor_sizes_device_, anchor_sizes_.size() * data_size);    \
+  cudaMalloc(&aspect_ratios_device_, aspect_ratios_.size() * data_size);  \
+  cudaMalloc(&stride_device_, stride_.size() * data_size);                \
+  cudaMalloc(&variances_device_, variances_.size() * data_size);          \
+  cudaMemcpy(anchor_sizes_device_, anchor_sizes_.data(),                  \
+             anchor_sizes_.size() * data_size, cudaMemcpyHostToDevice);   \
+  cudaMemcpy(aspect_ratios_device_, aspect_ratios_.data(),                \
+             aspect_ratios_.size() * data_size, cudaMemcpyHostToDevice);  \
+  cudaMemcpy(stride_device_, stride_.data(), stride_.size() * data_size,  \
+             cudaMemcpyHostToDevice);                                     \
+  cudaMemcpy(variances_device_, variances_.data(),                        \
+             variances_.size() * data_size, cudaMemcpyHostToDevice);

+AnchorGeneratorPlugin::AnchorGeneratorPlugin(
+    const nvinfer1::DataType data_type, const std::vector<float>& anchor_sizes,
+    const std::vector<float>& aspect_ratios, const std::vector<float>& stride,
+    const std::vector<float>& variances, const float offset, const int height,
+    const int width, const int num_anchors, const int box_num)
+    : data_type_(data_type),
+      anchor_sizes_(anchor_sizes),
+      aspect_ratios_(aspect_ratios),
+      stride_(stride),
+      variances_(variances),
+      offset_(offset),
+      height_(height),
+      width_(width),
+      num_anchors_(num_anchors),
+      box_num_(box_num) {
+  // anchors must be float32, which is the generate_proposals op's input
+  PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT,
+                    platform::errors::InvalidArgument(
+                        "TRT anchor generator plugin only accepts float32."));
+  PADDLE_ENFORCE_GE(height_, 0,
+                    platform::errors::InvalidArgument(
+                        "TRT anchor generator plugin only accepts a height "
+                        "greater than or equal to 0, but received height = %d.",
+                        height_));
+  PADDLE_ENFORCE_GE(width_, 0,
+                    platform::errors::InvalidArgument(
+                        "TRT anchor generator plugin only accepts a width "
+                        "greater than or equal to 0, but received width = %d.",
+                        width_));
+  PADDLE_ENFORCE_GE(
+      num_anchors_, 0,
+      platform::errors::InvalidArgument(
+          "TRT anchor generator plugin only accepts a number of anchors "
+          "greater than or equal to 0, but received number of anchors = %d.",
+          num_anchors_));
+  PADDLE_ENFORCE_GE(box_num_, 0,
+                    platform::errors::InvalidArgument(
+                        "TRT anchor generator plugin only accepts a box_num "
+                        "greater than or equal to 0, but received box_num = %d.",
+                        box_num_));
+  PrepareParamsOnDevice();
+}
+
+AnchorGeneratorPlugin::~AnchorGeneratorPlugin() {
+  auto release_device_ptr = [](void* ptr) {
+    if (ptr) {
+      cudaFree(ptr);
+      ptr = nullptr;
+    }
+  };
+  release_device_ptr(anchor_sizes_device_);
+  release_device_ptr(aspect_ratios_device_);
+  release_device_ptr(stride_device_);
+  release_device_ptr(variances_device_);
+}
+
+AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) {
+  DeserializeValue(&data, &length, &data_type_);
+  DeserializeValue(&data, &length, &anchor_sizes_);
+  DeserializeValue(&data, &length, &aspect_ratios_);
+  DeserializeValue(&data, &length, &stride_);
+  DeserializeValue(&data, &length, &variances_);
+  DeserializeValue(&data, &length, &offset_);
+  DeserializeValue(&data, &length, &height_);
+  DeserializeValue(&data, &length, &width_);
+  DeserializeValue(&data, 
&length, &num_anchors_);
+  DeserializeValue(&data, &length, &box_num_);
+  PrepareParamsOnDevice();
+}
+
+const char* AnchorGeneratorPlugin::getPluginType() const {
+  return "anchor_generator_plugin";
+}
+
+const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; }
+
+int AnchorGeneratorPlugin::getNbOutputs() const { return 2; }
+
+nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims* inputs, int nb_input_dims) {
+  nvinfer1::Dims dims{};
+  dims.nbDims = 4;
+  dims.d[0] = height_;
+  dims.d[1] = width_;
+  dims.d[2] = num_anchors_;
+  dims.d[3] = 4;
+  return dims;
+}
+
+bool AnchorGeneratorPlugin::supportsFormat(
+    nvinfer1::DataType type, nvinfer1::TensorFormat format) const {
+  // a static shape plugin can't support different types between input/output;
+  // it may cause additional overhead in half mode
+  return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR);
+}
+
+size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const {
+  return 0;
+}
+
+template <typename T>
+int AnchorGeneratorPlugin::enqueue_impl(int batch_size,
+                                        const void* const* inputs,
+                                        void** outputs, void* workspace,
+                                        cudaStream_t stream) {
+  const int block = 512;
+  const int gen_anchor_grid = (box_num_ + block - 1) / block;
+  T* anchors = static_cast<T*>(outputs[0]);
+  T* vars = static_cast<T*>(outputs[1]);
+  const T* anchor_sizes_device = static_cast<const T*>(anchor_sizes_device_);
+  const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
+  const T* stride_device = static_cast<const T*>(stride_device_);
+  const T* variances_device = static_cast<const T*>(variances_device_);
+  paddle::operators::GenAnchors<<<gen_anchor_grid, block, 0, stream>>>(
+      anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device,
+      anchor_sizes_.size(), stride_device, stride_.size(), height_, width_,
+      offset_);
+  const int var_grid = (box_num_ * 4 + block - 1) / block;
+  paddle::operators::SetVariance<<<var_grid, block, 0, stream>>>(
+      vars, variances_device, variances_.size(), box_num_ * 4);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs,
+                                   void** outputs, void* workspace,
+                                   cudaStream_t stream) {
+  return enqueue_impl<float>(batch_size, inputs, outputs, workspace, stream);
+}
+
+int AnchorGeneratorPlugin::initialize() { return 0; }
+
+void AnchorGeneratorPlugin::terminate() {}
+
+size_t AnchorGeneratorPlugin::getSerializationSize() const {
+  size_t serialize_size = 0;
+  serialize_size += SerializedSize(data_type_);
+  serialize_size += SerializedSize(anchor_sizes_);
+  serialize_size += SerializedSize(aspect_ratios_);
+  serialize_size += SerializedSize(stride_);
+  serialize_size += SerializedSize(variances_);
+  serialize_size += SerializedSize(offset_);
+  serialize_size += SerializedSize(height_);
+  serialize_size += SerializedSize(width_);
+  serialize_size += SerializedSize(num_anchors_);
+  serialize_size += SerializedSize(box_num_);
+  return serialize_size;
+}
+
+void AnchorGeneratorPlugin::serialize(void* buffer) const {
+  SerializeValue(&buffer, data_type_);
+  SerializeValue(&buffer, anchor_sizes_);
+  SerializeValue(&buffer, aspect_ratios_);
+  SerializeValue(&buffer, stride_);
+  SerializeValue(&buffer, variances_);
+  SerializeValue(&buffer, offset_);
+  SerializeValue(&buffer, height_);
+  SerializeValue(&buffer, width_);
+  SerializeValue(&buffer, num_anchors_);
+  SerializeValue(&buffer, box_num_);
+}
+
+void AnchorGeneratorPlugin::destroy() {}
+
+void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) {
+  namespace_ = std::string(lib_namespace);
+}
+
+const 
char* AnchorGeneratorPlugin::getPluginNamespace() const {
+  return namespace_.c_str();
+}
+
+nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType(
+    int index, const nvinfer1::DataType* input_type, int nb_inputs) const {
+  return data_type_;
+}
+
+bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch(
+    int output_index, const bool* input_is_broadcast, int nb_inputs) const {
+  return true;
+}
+
+bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch(
+    int input_index) const {
+  return false;
+}
+
+void AnchorGeneratorPlugin::configurePlugin(
+    const nvinfer1::Dims* input_dims, int nb_inputs,
+    const nvinfer1::Dims* output_dims, int nb_outputs,
+    const nvinfer1::DataType* input_types,
+    const nvinfer1::DataType* output_types, const bool* input_is_broadcast,
+    const bool* output_is_broadcast, nvinfer1::PluginFormat float_format,
+    int max_batch_size) {}
+
+nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const {
+  auto plugin = new AnchorGeneratorPlugin(
+      data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_,
+      height_, width_, num_anchors_, box_num_);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+void AnchorGeneratorPluginCreator::setPluginNamespace(
+    const char* lib_namespace) {
+  namespace_ = std::string(lib_namespace);
+}
+
+const char* AnchorGeneratorPluginCreator::getPluginNamespace() const {
+  return namespace_.c_str();
+}
+
+const char* AnchorGeneratorPluginCreator::getPluginName() const {
+  return "anchor_generator_plugin";
+}
+
+const char* AnchorGeneratorPluginCreator::getPluginVersion() const {
+  return "1";
+}
+
+const nvinfer1::PluginFieldCollection*
+AnchorGeneratorPluginCreator::getFieldNames() {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) {
+  const nvinfer1::PluginField* fields = fc->fields;
+  int type_id = -1;
+  std::vector<float> anchor_sizes, aspect_ratios, stride, variances;
+  float offset = .5;
+  int height = -1, width = -1;
+  int num_anchors = -1;
+  int box_num = -1;
+
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const std::string field_name(fc->fields[i].name);
+    const auto length = fc->fields[i].length;
+    if (field_name.compare("type_id") == 0) {
+      type_id = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("anchor_sizes") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      anchor_sizes.insert(anchor_sizes.end(), data, data + length);
+    } else if (field_name.compare("aspect_ratios") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      aspect_ratios.insert(aspect_ratios.end(), data, data + length);
+    } else if (field_name.compare("stride") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      stride.insert(stride.end(), data, data + length);
+    } else if (field_name.compare("variances") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      variances.insert(variances.end(), data, data + length);
+    } else if (field_name.compare("offset") == 0) {
+      offset = *static_cast<const float*>(fc->fields[i].data);
+    } else if (field_name.compare("height") == 0) {
+      height = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("width") == 0) {
+      width = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("num_anchors") == 0) {
+      num_anchors = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("box_num") == 0) {
+      box_num = *static_cast<const int*>(fc->fields[i].data);
+    } else {
+      assert(false && "unknown plugin field name.");
+    }
+  }
+  return new 
AnchorGeneratorPlugin(nvinfer1::DataType::kFLOAT, anchor_sizes,
+                                   aspect_ratios, stride, variances, offset,
+                                   height, width, num_anchors, box_num);
+}
+
+nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin(
+    const char* name, const void* serial_data, size_t serial_length) {
+  auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+#if IS_TRT_VERSION_GE(6000)
+AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(
+    const nvinfer1::DataType data_type, const std::vector<float>& anchor_sizes,
+    const std::vector<float>& aspect_ratios, const std::vector<float>& stride,
+    const std::vector<float>& variances, const float offset,
+    const int num_anchors)
+    : data_type_(data_type),
+      anchor_sizes_(anchor_sizes),
+      aspect_ratios_(aspect_ratios),
+      stride_(stride),
+      variances_(variances),
+      offset_(offset),
+      num_anchors_(num_anchors) {
+  // data_type_ is used to determine the output data type
+  // data_type_ can only be float32
+  // height, width, num_anchors are calculated at configurePlugin
+  PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT,
+                    platform::errors::InvalidArgument(
+                        "TRT anchor generator plugin only accepts float32."));
+  PADDLE_ENFORCE_GE(
+      num_anchors_, 0,
+      platform::errors::InvalidArgument(
+          "TRT anchor generator plugin only accepts a number of anchors "
+          "greater than or equal to 0, but received number of anchors = %d.",
+          num_anchors_));
+  PrepareParamsOnDevice();
+}
+
+AnchorGeneratorPluginDynamic::~AnchorGeneratorPluginDynamic() {
+  auto release_device_ptr = [](void* ptr) {
+    if (ptr) {
+      cudaFree(ptr);
+      ptr = nullptr;
+    }
+  };
+  release_device_ptr(anchor_sizes_device_);
+  release_device_ptr(aspect_ratios_device_);
+  release_device_ptr(stride_device_);
+  release_device_ptr(variances_device_);
+}
+
+AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data,
+                                                           size_t length) {
+  DeserializeValue(&data, &length, &data_type_);
+  DeserializeValue(&data, &length, &anchor_sizes_);
+  DeserializeValue(&data, &length, &aspect_ratios_);
+  DeserializeValue(&data, &length, &stride_);
+  DeserializeValue(&data, &length, &variances_);
+  DeserializeValue(&data, &length, &offset_);
+  DeserializeValue(&data, &length, &num_anchors_);
+  PrepareParamsOnDevice();
+}
+
+nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const {
+  auto plugin = new AnchorGeneratorPluginDynamic(
+      data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_,
+      num_anchors_);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) {
+  nvinfer1::DimsExprs ret{};
+  ret.nbDims = 4;
+  ret.d[0] = inputs[0].d[2];  // feature height
+  ret.d[1] = inputs[0].d[3];  // feature width
+  ret.d[2] = exprBuilder.constant(num_anchors_);
+  ret.d[3] = exprBuilder.constant(4);
+  return ret;
+}
+
+bool AnchorGeneratorPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) {
+  // the input can be anything; the anchor generator doesn't read the input's
+  // raw data, it only needs the shape info
+  auto type = inOut[pos].type;
+  auto format = inOut[pos].format;
+#if IS_TRT_VERSION_GE(7234)
+  if (pos == 0) return true;
+#else
+  if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR;
+#endif
+  return (type == nvinfer1::DataType::kFLOAT &&
+          format == 
nvinfer1::TensorFormat::kLINEAR);
+}
+
+void AnchorGeneratorPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
+
+size_t AnchorGeneratorPluginDynamic::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
+  return 0;
+}
+
+template <typename T>
+int AnchorGeneratorPluginDynamic::enqueue_impl(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  const int height = inputDesc[0].dims.d[2];
+  const int width = inputDesc[0].dims.d[3];
+  const int box_num = height * width * num_anchors_;
+  const int block = 512;
+  const int gen_anchor_grid = (box_num + block - 1) / block;
+  T* anchors = static_cast<T*>(outputs[0]);
+  T* vars = static_cast<T*>(outputs[1]);
+  const T* anchor_sizes_device = static_cast<const T*>(anchor_sizes_device_);
+  const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
+  const T* stride_device = static_cast<const T*>(stride_device_);
+  const T* variances_device = static_cast<const T*>(variances_device_);
+  paddle::operators::GenAnchors<<<gen_anchor_grid, block, 0, stream>>>(
+      anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device,
+      anchor_sizes_.size(), stride_device, stride_.size(), height, width,
+      offset_);
+  const int var_grid = (box_num * 4 + block - 1) / block;
+  paddle::operators::SetVariance<<<var_grid, block, 0, stream>>>(
+      vars, variances_device, variances_.size(), box_num * 4);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+int AnchorGeneratorPluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT);
+  assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT);
+  return enqueue_impl<float>(inputDesc, outputDesc, inputs, outputs, workspace,
+                             stream);
+}
+
+nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
+  return data_type_;
+}
+
+const char* AnchorGeneratorPluginDynamic::getPluginType() const {
+  return "anchor_generator_plugin_dynamic";
+}
+
+int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; }
+
+int AnchorGeneratorPluginDynamic::initialize() { return 0; }
+
+void AnchorGeneratorPluginDynamic::terminate() {}
+
+size_t AnchorGeneratorPluginDynamic::getSerializationSize() const {
+  size_t serialize_size = 0;
+  serialize_size += SerializedSize(data_type_);
+  serialize_size += SerializedSize(anchor_sizes_);
+  serialize_size += SerializedSize(aspect_ratios_);
+  serialize_size += SerializedSize(stride_);
+  serialize_size += SerializedSize(variances_);
+  serialize_size += SerializedSize(offset_);
+  serialize_size += SerializedSize(num_anchors_);
+  return serialize_size;
+}
+
+void AnchorGeneratorPluginDynamic::serialize(void* buffer) const {
+  SerializeValue(&buffer, data_type_);
+  SerializeValue(&buffer, anchor_sizes_);
+  SerializeValue(&buffer, aspect_ratios_);
+  SerializeValue(&buffer, stride_);
+  SerializeValue(&buffer, variances_);
+  SerializeValue(&buffer, offset_);
+  SerializeValue(&buffer, num_anchors_);
+}
+
+void AnchorGeneratorPluginDynamic::destroy() {}
+
+void AnchorGeneratorPluginDynamicCreator::setPluginNamespace(
+    const char* lib_namespace) {
+  namespace_ = std::string(lib_namespace);
+}
+
+const 
char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const {
+  return namespace_.c_str();
+}
+
+const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const {
+  return "anchor_generator_plugin_dynamic";
+}
+
+const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const {
+  return "1";
+}
+
+const nvinfer1::PluginFieldCollection*
+AnchorGeneratorPluginDynamicCreator::getFieldNames() {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) {
+  const nvinfer1::PluginField* fields = fc->fields;
+  int type_id = -1;
+  std::vector<float> anchor_sizes, aspect_ratios, stride, variances;
+  float offset = .5;
+  int num_anchors = -1;
+  for (int i = 0; i < fc->nbFields; ++i) {
+    const std::string field_name(fc->fields[i].name);
+    const auto length = fc->fields[i].length;
+    if (field_name.compare("type_id") == 0) {
+      type_id = *static_cast<const int*>(fc->fields[i].data);
+    } else if (field_name.compare("anchor_sizes") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      anchor_sizes.insert(anchor_sizes.end(), data, data + length);
+    } else if (field_name.compare("aspect_ratios") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      aspect_ratios.insert(aspect_ratios.end(), data, data + length);
+    } else if (field_name.compare("stride") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      stride.insert(stride.end(), data, data + length);
+    } else if (field_name.compare("variances") == 0) {
+      const auto* data = static_cast<const float*>(fc->fields[i].data);
+      variances.insert(variances.end(), data, data + length);
+    } else if (field_name.compare("offset") == 0) {
+      offset = *static_cast<const float*>(fc->fields[i].data);
+    } else if (field_name.compare("num_anchors") == 0) {
+      num_anchors = *static_cast<const int*>(fc->fields[i].data);
+    } else {
+      assert(false && "unknown plugin field name.");
+    }
+  }
+  return new AnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT,
+                                          anchor_sizes, aspect_ratios, stride,
+                                          variances, offset, num_anchors);
+}
+
+nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin(
+    const char* name, const void* serial_data, size_t serial_length) {
+  auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h
new file mode 100644
index 00000000000000..aff0b6a6802f11
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
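The serialization code in the .cu file above relies on a strict contract: serialize() must write fields in exactly the order in which the deserializing constructor reads them back, and getSerializationSize() must account for every byte. A stripped-down sketch of that contract, with plain memcpy helpers standing in for the SerializeValue/DeserializeValue helpers used above:

#include <cassert>
#include <cstring>
#include <vector>

template <typename T>
void SerializeValue(char** buf, const T& value) {
  std::memcpy(*buf, &value, sizeof(T));
  *buf += sizeof(T);
}

template <typename T>
void DeserializeValue(const char** buf, T* value) {
  std::memcpy(value, *buf, sizeof(T));
  *buf += sizeof(T);
}

int main() {
  const float offset = 0.5f;
  const int num_anchors = 15;
  std::vector<char> storage(sizeof(float) + sizeof(int));

  char* w = storage.data();
  SerializeValue(&w, offset);       // the field order defines the wire format
  SerializeValue(&w, num_anchors);

  const char* r = storage.data();
  float offset2 = 0.f;
  int num_anchors2 = 0;
  DeserializeValue(&r, &offset2);   // must be read back in the same order
  DeserializeValue(&r, &num_anchors2);
  assert(offset2 == 0.5f && num_anchors2 == 15);
  return 0;
}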
diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h
new file mode 100644
index 00000000000000..aff0b6a6802f11
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h
@@ -0,0 +1,201 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/engine.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext {
+ public:
+  explicit AnchorGeneratorPlugin(
+      const nvinfer1::DataType, const std::vector<float>& anchor_sizes,
+      const std::vector<float>& aspect_ratios,
+      const std::vector<float>& stride, const std::vector<float>& variances,
+      const float offset, const int height, const int width,
+      const int num_anchors, const int box_num);
+  AnchorGeneratorPlugin(const void* data, size_t length);
+  ~AnchorGeneratorPlugin() override;
+  const char* getPluginType() const override;
+  const char* getPluginVersion() const override;
+  int getNbOutputs() const override;
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
+                                     int nb_input_dims) override;
+  bool supportsFormat(nvinfer1::DataType type,
+                      nvinfer1::TensorFormat format) const override;
+  size_t getWorkspaceSize(int max_batch_size) const override;
+  int enqueue(int batch_size, const void* const* inputs, void** outputs,
+              void* workspace, cudaStream_t stream) override;
+  int initialize() override;
+  void terminate() override;
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+  void destroy() override;
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* input_type,
+                                       int nb_inputs) const override;
+  bool isOutputBroadcastAcrossBatch(int output_index,
+                                    const bool* input_is_broadcast,
+                                    int nb_inputs) const override;
+  bool canBroadcastInputAcrossBatch(int input_index) const override;
+  void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs,
+                       const nvinfer1::Dims* output_dims, int nb_outputs,
+                       const nvinfer1::DataType* input_types,
+                       const nvinfer1::DataType* output_types,
+                       const bool* input_is_broadcast,
+                       const bool* output_is_broadcast,
+                       nvinfer1::PluginFormat float_format,
+                       int max_batch_size) override;
+  nvinfer1::IPluginV2Ext* clone() const override;
+
+ private:
+  template <typename T>
+  int enqueue_impl(int batch_size, const void* const* inputs, void** outputs,
+                   void* workspace, cudaStream_t stream);
+  nvinfer1::DataType data_type_;
+  std::vector<float> anchor_sizes_;
+  std::vector<float> aspect_ratios_;
+  std::vector<float> stride_;
+  std::vector<float> variances_;
+  float offset_;
+  void* anchor_sizes_device_;
+  void* aspect_ratios_device_;
+  void* stride_device_;
+  void* variances_device_;
+  int height_;
+  int width_;
+  int num_anchors_;
+  int box_num_;
+  std::string namespace_;
+};
+
+class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator {
+ public:
+  AnchorGeneratorPluginCreator() = default;
+  ~AnchorGeneratorPluginCreator() override = default;
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+  const char* getPluginName() const override;
+  const char* getPluginVersion() const override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() override;
+  nvinfer1::IPluginV2Ext* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override;
+  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name,
+                                            const void* serial_data,
+                                            size_t serial_length) override;
+
+ private:
+  std::string namespace_;
+  nvinfer1::PluginFieldCollection field_collection_;
+};
+
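// Hedged aside: if the creators registered below are also visible through
// TensorRT's global plugin registry (REGISTER_TENSORRT_PLUGIN-style
// registration), they can be recovered at runtime by name and version.
// Whether REGISTER_TRT_PLUGIN_V2 routes through TensorRT's registry or
// through Paddle's own is an assumption here, so treat this as a sketch only.
inline nvinfer1::IPluginCreator* FindAnchorGeneratorDynamicCreator() {
  // The strings must match getPluginName()/getPluginVersion() of the creator.
  return getPluginRegistry()->getPluginCreator(
      "anchor_generator_plugin_dynamic", "1");
}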
+REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginCreator);
+
+#if IS_TRT_VERSION_GE(6000)
+class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT {
+ public:
+  explicit AnchorGeneratorPluginDynamic(
+      const nvinfer1::DataType data_type,
+      const std::vector<float>& anchor_sizes,
+      const std::vector<float>& aspect_ratios,
+      const std::vector<float>& stride, const std::vector<float>& variances,
+      const float offset, const int num_anchors);
+  AnchorGeneratorPluginDynamic(void const* data, size_t length);
+  ~AnchorGeneratorPluginDynamic();
+  nvinfer1::IPluginV2DynamicExt* clone() const override;
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) override;
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs, int nbOutputs) override;
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* out,
+                       int nbOutputs) override;
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const override;
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs,
+              void* workspace, cudaStream_t stream) override;
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* inputTypes,
+                                       int nbInputs) const override;
+  const char* getPluginType() const override;
+  int getNbOutputs() const override;
+  int initialize() override;
+  void terminate() override;
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+  void destroy() override;
+
+ private:
+  template <typename T>
+  int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc,
+                   const nvinfer1::PluginTensorDesc* outputDesc,
+                   const void* const* inputs, void* const* outputs,
+                   void* workspace, cudaStream_t stream);
+  nvinfer1::DataType data_type_;
+  std::vector<float> anchor_sizes_;
+  std::vector<float> aspect_ratios_;
+  std::vector<float> stride_;
+  std::vector<float> variances_;
+  float offset_;
+  void* anchor_sizes_device_;
+  void* aspect_ratios_device_;
+  void* stride_device_;
+  void* variances_device_;
+  int num_anchors_;
+  std::string namespace_;
+};
+
+class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator {
+ public:
+  AnchorGeneratorPluginDynamicCreator() = default;
+  ~AnchorGeneratorPluginDynamicCreator() override = default;
+  void setPluginNamespace(const char* lib_namespace) override;
+  const char* getPluginNamespace() const override;
+  const char* getPluginName() const override;
+  const char* getPluginVersion() const override;
+  const nvinfer1::PluginFieldCollection* getFieldNames() override;
+  nvinfer1::IPluginV2Ext* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override;
+  nvinfer1::IPluginV2Ext* deserializePlugin(const char* name,
+                                            const void* serial_data,
+                                            size_t serial_length) override;
+
+ private:
+  std::string namespace_;
+  nvinfer1::PluginFieldCollection field_collection_;
+};
+REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator);
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
index 49212aae9aa90d..75a1dd85f0f2c4 100644
---
a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -144,9 +144,9 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { int axis_; }; -class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { +class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - ElementwisePluginV2Creator() {} + ElementwisePluginDynamicCreator() {} const char* getPluginName() const override { return "elementwise_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -182,7 +182,7 @@ class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(ElementwisePluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 6c8381a750cba9..7de84a8fc49bcc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -306,9 +306,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } }; -class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { +class EmbEltwiseLayernormPluginDynamicCreator + : public nvinfer1::IPluginCreator { public: - EmbEltwiseLayernormPluginV2Creator() {} + EmbEltwiseLayernormPluginDynamicCreator() {} const char* getPluginName() const override { return "fused_embedding_eltwise_layernorm_plugin"; } @@ -345,7 +346,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 979f600a3a9cea..23e507ee477e1a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -115,9 +115,9 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginV2Creator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginV2Creator() {} + GeluPluginDynamicCreator() {} const char* getPluginName() const override { return "gelu_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -153,7 +153,7 @@ class GeluPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index b852f5a454c07c..7147d9855755be 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -118,9 +118,9 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; }; -class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { +class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QkvToContextPluginV2Creator() 
{} + QkvToContextPluginDynamicCreator() {} const char* getPluginName() const override { return "qkv_to_context_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 42c0df41a1b5ef..6e7ed0054f502e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -364,6 +364,7 @@ RoiAlignPluginDynamicCreator::getFieldNames() { nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( const char* name, const nvinfer1::PluginFieldCollection* fc) { const nvinfer1::PluginField* fields = fc->fields; + return nullptr; } nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 0e457fdc8f4474..ac621784550f2f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -119,9 +119,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; }; -class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginV2Creator() {} + SkipLayerNormPluginDynamicCreator() {} const char* getPluginName() const override { return "skip_layernorm_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -156,7 +156,7 @@ class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 340406c5e7fae8..9d4f9a35c3b6fe 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -121,9 +121,9 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginV2Creator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SlicePluginV2Creator() {} + SlicePluginDynamicCreator() {} const char* getPluginName() const override { return "slice_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class SlicePluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; }; -REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index e43b57357fb64f..1ee895154d6b04 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ 
b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -193,9 +193,9 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { std::vector output_length_; }; -class SplitPluginV2Creator : public nvinfer1::IPluginCreator { +class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SplitPluginV2Creator() {} + SplitPluginDynamicCreator() {} const char* getPluginName() const override { return "split_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -231,7 +231,7 @@ class SplitPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SplitPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 85cc6916238fef..11579aadcc4573 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -127,9 +127,9 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginV2Creator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SwishPluginV2Creator() {} + SwishPluginDynamicCreator() {} const char* getPluginName() const override { return "swish_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -165,7 +165,7 @@ class SwishPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index e1b4c898d212ff..13d07e774036a4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -283,10 +283,7 @@ void YoloBoxPlugin::serialize(void* buffer) const { SerializeValue(&buffer, input_w_); } -void YoloBoxPlugin::destroy() { - cudaFree(anchors_device_); - delete this; -} +void YoloBoxPlugin::destroy() {} void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { namespace_ = std::string(lib_namespace); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea376773899..565797d51dd513 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -33,6 +33,8 @@ if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 467a5ff9063a65..ed878727532285 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -10,6 +10,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists copy_if_different(${pybind_file} ${pybind_file_final}) add_subdirectory(math) +add_subdirectory(eigen) add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) @@ -110,8 +111,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun 
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index c6d2fbccd8e84b..04f329088fafe8 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -42,6 +42,10 @@ template class BaseGPUFunctor { public: using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } }; /* ========================================================================== */ @@ -57,42 +61,35 @@ class ReluGPUFunctor : public BaseGPUFunctor { // for relu forward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* x); + const typename CudaVecType::type in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; + } // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T x) { - return x > zero_ ? x : zero_; + __device__ __forceinline__ T ComputeRemainder(const T in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* x) { -// relu forward : out = max(x, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(x) > zero_ ? __ldg(x) : zero_; -#else - return (*x) > zero_ ? (*x) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* xx) { - // relu forward : out = max(xx, 0) - return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), - (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +ReluGPUFunctor::Compute(const CudaVecType::type in) { + // relu forward : out = max(in, 0) + return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), + (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* in) { +ReluGPUFunctor::Compute(const CudaVecType::type in) { // relu forward : out = max(in, 0) #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); + return __hmul2(__hgt2(in, kzero), in); #else - const float2 xx = __half22float2(*in); + const float2 xx = __half22float2(in); return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), (xx.y > 0.0f) * static_cast(xx.y)); #endif @@ -112,8 +109,10 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { // for relu backward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* out, - const typename CudaVecType::type* dout); + const typename CudaVecType::type out, + const typename CudaVecType::type dout) { + return out > zero_ ? 
dout : zero_; + } // when num % vecsize != 0 this func will be used __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { @@ -124,44 +123,132 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(out) > zero_ ? __ldg(dout) : zero_; -#else - return (*out) > zero_ ? (*dout) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), - (out->z > zero_) * (dout->z), - (out->w > zero_) * (dout->w)); + return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), + (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? dout : 0; #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); + return __hmul2(__hgt2(out, kzero), dout); #else - const float2 xx = __half22float2(*out); - const float2 yy = __half22float2(*dout); + const float2 xx = __half22float2(out); + const float2 yy = __half22float2(dout); return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), (xx.y > 0.0f) * static_cast(yy.y)); #endif } +/* ========================================================================== */ +/* ======================== leaky relu forward ======================== + */ +template +class LeakyReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + // leakyrelu forward : out = x > 0 ? x : x * alpha + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in) { + return in > zero_ ? in : static_cast(alpha_) * in; + } + + __device__ __forceinline__ T ComputeRemainder(const T in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return in > zero_ ? in : static_cast(alpha_) * in; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return make_float4((in.x > zero_) ? (in.x) : (in.x) * alpha_, + (in.y > zero_) ? (in.y) : (in.y) * alpha_, + (in.z > zero_) ? (in.z) : (in.z) * alpha_, + (in.w > zero_) ? (in.w) : (in.w) * alpha_); +} + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + const float2 xx = __half22float2(in); + return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, + (xx.y > 0.0f) ? 
xx.y : xx.y * alpha_); +} +/* ========================================================================== */ + +/* =========================== leaky relu backward ======================= + */ +template +class LeakyReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + + // for leaky relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in, + const typename CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), + (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), + (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), + (in.w > zero_) ? (dout.w) : alpha_ * (dout.w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< + float16>::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + const float2 xx = __half22float2(in); + const float2 yy = __half22float2(dout); + return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, + (xx.y > 0.0f) ? 
yy.y : alpha_ * yy.y); +} + /* ========================================================================== */ template @@ -176,14 +263,23 @@ __global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, const VecType* in_forward = reinterpret_cast(forward_data); const VecType* in_dout = reinterpret_cast(dout); VecType* out = reinterpret_cast(dx); - + VecType forward_vec, dout_vec; + T in_data, dout_data; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in_forward + i), (in_dout + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + forward_vec = __ldg(in_forward + i); + dout_vec = __ldg(in_dout + i); +#else + forward_vec = in_forward[i]; + dout_vec = in_dout[i]; +#endif + out[i] = functor.Compute(forward_vec, dout_vec); } while (idx == loop && tail) { - dx[num - tail] = - functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + in_data = forward_data[num - tail]; + dout_data = dout[num - tail]; + dx[num - tail] = functor.ComputeRemainder(in_data, dout_data); --tail; } } @@ -199,9 +295,14 @@ __global__ void ActivationkernelVec(const T* src, T* dst, int num, int tail = num % vecsize; const VecType* in = reinterpret_cast(src); VecType* out = reinterpret_cast(dst); - + VecType x_vec; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + x_vec = __ldg(in + i); +#else + x_vec = in[i]; +#endif + out[i] = functor.Compute(x_vec); } while (idx == loop && tail) { @@ -231,6 +332,10 @@ class ActivationGPUKernel block = 256; #endif Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((num / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -270,7 +375,12 @@ class ActivationGradGPUKernel #ifdef __HIPCC__ block = 256; #endif + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((numel / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -300,12 +410,28 @@ namespace plat = paddle::platform; ops::grad_functor>, \ ops::ActivationGradKernel>); - FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); +#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>); + /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, + LeakyReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -330,21 +456,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>); 
+REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, ReluGPUFunctor, ReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h index 97e3ed9c1adda0..ecfd10d2fa6fbd 100644 --- a/paddle/fluid/operators/addmm_op.h +++ b/paddle/fluid/operators/addmm_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -32,8 +33,8 @@ template using EigenTensor = framework::EigenTensor; -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; using Tensor = framework::Tensor; @@ -105,7 +106,8 @@ class AddMMKernel : public framework::OpKernel { auto eigen_out = EigenTensor::From(*out); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_input.broadcast(bcast_dims); + EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, x->data(), x_dims[1], y->data(), y_dims[1], beta, diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9dcc..977a208d20e783 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -19,6 +19,12 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND) + op_library(gen_nccl_id_op) + op_library(c_gen_nccl_id_op) +endif() + + if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 1592d809f91e26..7da30f64d1ce39 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -84,6 +85,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index c4abe284d72096..700d1173e2ff68 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -15,40 +15,20 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorBase { +class CSyncCalcStreamOp : public framework::OperatorWithKernel { public: - CSyncCalcStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -65,10 +45,36 @@ Call calculation stream synchronization. } }; +template +class CSyncCalcStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, - ops::CSyncCalcStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index adf27069f524e4..95b9cd040fe94e 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -14,45 +14,25 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" #endif namespace paddle { namespace operators { -class CSyncCommStreamOp : public framework::OperatorBase { +class CSyncCommStreamOp : public framework::OperatorWithKernel { public: - CSyncCommStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); + using framework::OperatorWithKernel::OperatorWithKernel; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -72,10 +52,38 @@ Call communication stream synchronization. 
} }; +template +class CSyncCommStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + + auto place = ctx.GetPlace(); + + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, - ops::CSyncCommStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, + ops::CSyncCommStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bcb40..99a92469e8502b 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 364e3ab8d26c3f..94d1f707b74c2e 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel { "and input channel number is %d", output->dims()[1], input->dims()[1])); } - // transform tensor - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); @@ -944,16 
+934,12 @@ class DepthwiseConvKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } else { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } } }; @@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { context.Attr("padding_algorithm"); const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_input(input->type()); - Tensor transformed_output_grad(output_grad->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); @@ -1025,33 +996,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } else { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } } @@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - 
depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } else { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index b4c27a63dbd2f2..388b8531571086 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -49,14 +49,11 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, anchor_width = scale_w * base_w; anchor_height = scale_h * base_h; - T xmin = (x_ctr - 0.5 * (anchor_width - 1)); - T ymin = (y_ctr - 0.5 * (anchor_height - 1)); - T xmax = (x_ctr + 0.5 * (anchor_width - 1)); - T ymax = (y_ctr + 0.5 * (anchor_height - 1)); - out[i * 4] = xmin; - out[i * 4 + 1] = ymin; - out[i * 4 + 2] = xmax; - out[i * 4 + 3] = ymax; + T xmin = (x_ctr - .5f * (anchor_width - 1)); + T ymin = (y_ctr - .5f * (anchor_height - 1)); + T xmax = (x_ctr + .5f * (anchor_width - 1)); + T ymax = (y_ctr + .5f * (anchor_height - 1)); + reinterpret_cast(out)[i] = make_float4(xmin, ymin, xmax, ymax); } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index e0e499d76a19ba..599f6935736f94 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -22,6 +22,19 @@ limitations under the License. */ namespace paddle { namespace operators { +#ifdef PADDLE_WITH_CUDA +template +extern __global__ void GenAnchors(T* out, const T* aspect_ratios, + const int ar_num, const T* anchor_sizes, + const int as_num, const T* stride, + const int sd_num, const int height, + const int width, const T offset); + +template +extern __global__ void SetVariance(T* out, const T* var, const int vnum, + const int num); +#endif + template class AnchorGeneratorOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/eigen/CMakeLists.txt b/paddle/fluid/operators/eigen/CMakeLists.txt new file mode 100644 index 00000000000000..848bf2433c5e39 --- /dev/null +++ b/paddle/fluid/operators/eigen/CMakeLists.txt @@ -0,0 +1,10 @@ +file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +cc_library(eigen_cc_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) +if(WITH_GPU OR WITH_ROCM) + file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") + if(WITH_GPU) + nv_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) + elseif(WITH_ROCM) + hip_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) + endif() +endif() diff --git a/paddle/fluid/operators/eigen/broadcast.cc b/paddle/fluid/operators/eigen/broadcast.cc new file mode 100644 index 00000000000000..dab25f95493726 --- /dev/null +++ b/paddle/fluid/operators/eigen/broadcast.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenBroadcast { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::DefaultDevice& dev, OutType out, InType in, + const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } + + static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out, + InType32BitIndex in, const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } +}; + +template +struct EigenBroadcastGrad { + using Array = Eigen::DSizes; + using Array2 = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, InType in, + const Array& reduce_dims, const Array2& reshape_dims) { + out.device(dev) = + in.reshape(reshape_dims).sum(reduce_dims).reshape(out.dimensions()); + } +}; + +#define INSTANTIATION(FUNCTOR, T) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenBroadcast, bool); +INSTANTIATION(EigenBroadcast, platform::float16); +INSTANTIATION(EigenBroadcast, float); +INSTANTIATION(EigenBroadcast, double); +INSTANTIATION(EigenBroadcast, int); +INSTANTIATION(EigenBroadcast, int64_t); +INSTANTIATION(EigenBroadcastGrad, bool); +INSTANTIATION(EigenBroadcastGrad, float); +INSTANTIATION(EigenBroadcastGrad, platform::float16); +INSTANTIATION(EigenBroadcastGrad, double); +INSTANTIATION(EigenBroadcastGrad, int); +INSTANTIATION(EigenBroadcastGrad, int64_t); +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/broadcast.cu b/paddle/fluid/operators/eigen/broadcast.cu new file mode 100644 index 00000000000000..63e244d393a9bc --- /dev/null +++ b/paddle/fluid/operators/eigen/broadcast.cu @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenBroadcast { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::GpuDevice& dev, OutType out, InType in, + const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } + + static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out, + InType32BitIndex in, const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } +}; + +template +struct EigenBroadcastGrad { + using Array = Eigen::DSizes; + using Array2 = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, InType in, + const Array& reduce_dims, const Array2& reshape_dims) { + out.device(dev) = + in.reshape(reshape_dims).sum(reduce_dims).reshape(out.dimensions()); + } +}; + +#define INSTANTIATION(FUNCTOR, T) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenBroadcast, bool); +INSTANTIATION(EigenBroadcast, platform::float16); +INSTANTIATION(EigenBroadcast, float); +INSTANTIATION(EigenBroadcast, double); +INSTANTIATION(EigenBroadcast, int); +INSTANTIATION(EigenBroadcast, int64_t); +INSTANTIATION(EigenBroadcastGrad, bool); +INSTANTIATION(EigenBroadcastGrad, float); +INSTANTIATION(EigenBroadcastGrad, platform::float16); +INSTANTIATION(EigenBroadcastGrad, double); +INSTANTIATION(EigenBroadcastGrad, int); +INSTANTIATION(EigenBroadcastGrad, int64_t); +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h new file mode 100644 index 00000000000000..59669505959f3f --- /dev/null +++ b/paddle/fluid/operators/eigen/eigen_function.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { + +template +struct EigenBroadcast { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + static void Eval(const EigenDevice& dev, OutType out, InType in, + const Array& bcast); + static void Eval(const EigenDevice& dev, OutType32BitIndex out, + InType32BitIndex in, const Array& bcast); +}; + +template +struct EigenBroadcastGrad { + using Array = Eigen::DSizes; + using Array2 = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, InType in, + const Array& reduce_dims, const Array2& reshape_dims); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 8de6416065d9a7..313607d975e60a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -24,7 +24,10 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseAdd { +struct SameDimsElemwiseAdd< + platform::CUDADeviceContext, T, + typename std::enable_if::value && + !std::is_same::value>::type> { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { @@ -36,38 +39,68 @@ struct SameDimsElemwiseAdd { } }; -template <> -struct SameDimsElemwiseAdd { +template +struct SameDimsElemwiseAdd< + platform::CUDADeviceContext, T, + typename std::enable_if::value || + std::is_same::value>::type> { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); + int vec_size = sizeof(float4) / sizeof(T); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseAddCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); + if (std::is_same::value) { + SameDimsElemwiseAddCUDAKernel<<< + grid_size, block_size, 0, + ctx.template device_context() + .stream()>>>(x->data(), y->data(), z->data(), + size); + } else { + const half* x2 = + reinterpret_cast(x->data()); + const half* y2 = + reinterpret_cast(y->data()); + half* z2 = reinterpret_cast(z->data()); + SameDimsElemwiseAddCUDAKernel<<< + grid_size, block_size, 0, + ctx.template device_context() + .stream()>>>(x2, y2, z2, size); + } } }; template -static __global__ void SimpleElemwiseAddGradCUDAKernel(const T* dout, - int64_t size, T* dx, - T* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + int loop = size / vec_size; + int 
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
index 8de6416065d9a7..313607d975e60a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -24,7 +24,10 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
+struct SameDimsElemwiseAdd<
+    platform::CUDADeviceContext, T,
+    typename std::enable_if<!std::is_same<T, platform::float16>::value &&
+                            !std::is_same<T, float>::value>::type> {
   void operator()(const framework::ExecutionContext& ctx,
                   const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z) {
@@ -36,38 +39,68 @@ struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
   }
 };
 
-template <>
-struct SameDimsElemwiseAdd<platform::CUDADeviceContext, platform::float16> {
+template <typename T>
+struct SameDimsElemwiseAdd<
+    platform::CUDADeviceContext, T,
+    typename std::enable_if<std::is_same<T, platform::float16>::value ||
+                            std::is_same<T, float>::value>::type> {
   void operator()(const framework::ExecutionContext& ctx,
                   const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z) {
     auto size = x->numel();
-    dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
-                              PADDLE_CUDA_THREAD_SIZE,
-                          1);
+    int vec_size = sizeof(float4) / sizeof(T);
+    dim3 grid_size =
+        dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) /
+                 PADDLE_CUDA_THREAD_SIZE,
+             1);
     dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
-    const half* x2 =
-        reinterpret_cast<const half*>(x->data<platform::float16>());
-    const half* y2 =
-        reinterpret_cast<const half*>(y->data<platform::float16>());
-    half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
-    SameDimsElemwiseAddCUDAKernel<<<
-        grid_size, block_size, 0,
-        ctx.template device_context<platform::CUDADeviceContext>().stream()>>>(
-        x2, y2, z2, size);
+    if (std::is_same<T, float>::value) {
+      SameDimsElemwiseAddCUDAKernel<<<
+          grid_size, block_size, 0,
+          ctx.template device_context<platform::CUDADeviceContext>()
+              .stream()>>>(x->data<float>(), y->data<float>(),
+                           z->data<float>(), size);
+    } else {
+      const half* x2 =
+          reinterpret_cast<const half*>(x->data<platform::float16>());
+      const half* y2 =
+          reinterpret_cast<const half*>(y->data<platform::float16>());
+      half* z2 = reinterpret_cast<half*>(z->data<platform::float16>());
+      SameDimsElemwiseAddCUDAKernel<<<
+          grid_size, block_size, 0,
+          ctx.template device_context<platform::CUDADeviceContext>()
+              .stream()>>>(x2, y2, z2, size);
+    }
   }
 };
 
 template <typename T>
-static __global__ void SimpleElemwiseAddGradCUDAKernel(const T* dout,
-                                                       int64_t size, T* dx,
-                                                       T* dy) {
-  int col = blockIdx.x * blockDim.x + threadIdx.x;
+static __global__ void SimpleElemwiseAddGradCUDAKernel(
+    const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = gridDim.x * blockDim.x;
+  int loop = size / vec_size;
+  int remainder = size % vec_size;
+  const float4* dout_vec = reinterpret_cast<const float4*>(dout);
+  float4* dx_vec = reinterpret_cast<float4*>(dx);
+  float4* dy_vec = reinterpret_cast<float4*>(dy);
+  float4 tmp_loop;
+
+  for (int i = tid; i < loop; i += stride) {
+    tmp_loop = dout_vec[i];
+    dx_vec[i] = tmp_loop;
+    dy_vec[i] = tmp_loop;
+  }
 
-  while (col < size) {
-    dx[col] = dout[col];
-    dy[col] = dout[col];
-    col += blockDim.x * gridDim.x;
+  if (tid == loop && remainder != 0) {
+    T tmp_rem;
+    while (remainder) {
+      int idx = size - remainder;
+      remainder--;
+      tmp_rem = dout[idx];
+      dx[idx] = tmp_rem;
+      dy[idx] = tmp_rem;
+    }
   }
 }
 
@@ -79,15 +112,39 @@ elementwise_add_grad(const framework::ExecutionContext& ctx,
                      const framework::Tensor* out,
                      const framework::Tensor* dout, framework::Tensor* dx,
                      framework::Tensor* dy) {
-  dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
-  auto size = x->numel();
-  dim3 grid_size =
-      dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1);
-  SimpleElemwiseAddGradCUDAKernel<
-      T><<<grid_size, block_size, 0,
-           ctx.template device_context<platform::CUDADeviceContext>()
-               .stream()>>>(dout->data<T>(), size,
-                            dx->mutable_data<T>(ctx.GetPlace()),
-                            dy->mutable_data<T>(ctx.GetPlace()));
+  auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+  auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
+  auto* dout_data = dout->data<T>();
+  if (dx_data == dout_data && dy_data != dout_data) {
+    VLOG(4) << "Special case when dx_data is the same as dout_data, "
+               "only need copy dout to dy";
+    framework::TensorCopy(
+        *dout, ctx.GetPlace(),
+        ctx.template device_context<platform::CUDADeviceContext>(), dy);
+  } else if (dx_data != dout_data && dy_data == dout_data) {
+    VLOG(4) << "Special case when dy_data is the same as dout_data, "
+               "only need copy dout to dx";
+    framework::TensorCopy(
+        *dout, ctx.GetPlace(),
+        ctx.template device_context<platform::CUDADeviceContext>(), dx);
+  } else if (dx_data != dout_data && dy_data != dout_data) {
+    auto size = x->numel();
+    int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
+    dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
+    dim3 grid_size =
+        dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) /
+                 PADDLE_CUDA_THREAD_SIZE,
+             1);
+    SimpleElemwiseAddGradCUDAKernel<
+        T><<<grid_size, block_size, 0,
+             ctx.template device_context<platform::CUDADeviceContext>()
+                 .stream()>>>(dout->data<T>(), size, vec_size,
+                              dx->mutable_data<T>(ctx.GetPlace()),
+                              dy->mutable_data<T>(ctx.GetPlace()));
+  } else {
+    VLOG(4) << "Special case when dy_data is the same as dout_data, "
+               "and dx_data is the same as dout_data, do not need "
+               "any operator";
+  }
 }
 
 }  // namespace operators
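Editor's note: the grad kernel above is the simplest instance of the float4 scheme; bulk traffic goes through 128-bit loads and stores, and the single thread with `tid == loop` drains the `size % vec_size` tail. The toy below (my own sketch; `vec_copy` is not a name from the patch) isolates the pattern so it can be compiled with nvcc on its own. Note the launch must cover `tid == loop`, which the patch's `grid_size` computation guarantees.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride copy moving 128 bits per access, like the patched
// SimpleElemwiseAddGradCUDAKernel; the thread whose tid equals `loop`
// handles the size % 4 scalar tail.
__global__ void vec_copy(const float* __restrict__ in, float* out, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  int loop = size / 4;
  int remainder = size % 4;
  const float4* in_vec = reinterpret_cast<const float4*>(in);
  float4* out_vec = reinterpret_cast<float4*>(out);
  for (int i = tid; i < loop; i += stride) out_vec[i] = in_vec[i];
  if (tid == loop && remainder != 0) {
    for (int idx = size - remainder; idx < size; ++idx) out[idx] = in[idx];
  }
}

int main() {
  const int n = 1027;  // deliberately not a multiple of 4
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = static_cast<float>(i);
  vec_copy<<<(n / 4 + 255) / 256 + 1, 256>>>(in, out, n);  // covers tid==loop
  cudaDeviceSynchronize();
  printf("out[n-1] = %f\n", out[n - 1]);  // expect 1026.0
  cudaFree(in);
  cudaFree(out);
  return 0;
}
```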
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
index 96583d06571c82..0cf9294c9de67f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -43,7 +43,7 @@ struct SameDimsElemwiseDiv<platform::CUDADeviceContext, platform::float16> {
                   const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z) {
     auto size = x->numel();
-    dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
+    dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
                               PADDLE_CUDA_THREAD_SIZE,
                           1);
     dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index 5b598ab2d788eb..e01b5eb5fb73d9 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -43,7 +43,7 @@ struct SameDimsElemwiseMul<platform::CUDADeviceContext, platform::float16> {
                   const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z) {
     auto size = x->numel();
-    dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
+    dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
                               PADDLE_CUDA_THREAD_SIZE,
                           1);
     dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h
index 1121d0ef68ce2f..8344b3d9838b00 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h
@@ -18,7 +18,11 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#ifdef __HIPCC__
+#define PADDLE_CUDA_THREAD_SIZE 256
+#else
 #define PADDLE_CUDA_THREAD_SIZE 512
+#endif
 
 #ifdef PADDLE_WITH_CUDA
 #include <cuda.h>
@@ -34,10 +38,6 @@ limitations under the License. */
 #endif
 #endif  // PADDLE_WITH_HIP
 
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
-#define __h2div h2div
-#endif
-
 #define DIV_ERROR_INFO                                                      \
   "InvalidArgumentError: Integer division by zero encountered in divide. " \
   "Please check.\n"
@@ -162,32 +162,62 @@ inline DEVICE half2 half2_div(const half2& a, const half2& b) {
 #endif
 }
 
-#define DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Func, expr, FP16Function)           \
-  template <typename T>                                                      \
-  __global__ void SameDimsElemwise##Func##CUDAKernel(const T* x, const T* y, \
-                                                     T* z, int64_t size) {   \
-    int col = blockIdx.x * blockDim.x + threadIdx.x;                         \
-    while (col < size) {                                                     \
-      z[col] = x[col] expr y[col];                                           \
-      col += blockDim.x * gridDim.x;                                         \
-    }                                                                        \
-  }                                                                          \
-  template <>                                                                \
-  inline __global__ void SameDimsElemwise##Func##CUDAKernel(                 \
-      const half* x, const half* y, half* z, int64_t size) {                 \
-    int start = threadIdx.x + blockDim.x * blockIdx.x;                       \
-    int stride = blockDim.x * gridDim.x;                                     \
-    int n2 = size / 2;                                                       \
-    const half2* x2 = reinterpret_cast<const half2*>(x);                     \
-    const half2* y2 = reinterpret_cast<const half2*>(y);                     \
-    half2* z2 = reinterpret_cast<half2*>(z);                                 \
-    for (int i = start; i < n2; i += stride) {                               \
-      z2[i] = FP16Function(x2[i], y2[i]);                                    \
-    }                                                                        \
-    if (start == 0 && (size % 2)) {                                          \
-      z[size - 1] = __float2half(__half2float(x[size - 1])                   \
-                                     expr __half2float(y[size - 1]));        \
-    }                                                                        \
+#define DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Func, expr, FP16Function)           \
+  inline __global__ void SameDimsElemwise##Func##CUDAKernel(                 \
+      const float* __restrict__ x, const float* __restrict__ y, float* z,    \
+      int64_t size) {                                                        \
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;                         \
+    int stride = gridDim.x * blockDim.x;                                     \
+    int loop = size / 4;                                                     \
+    int remainder = size % 4;                                                \
+    const float4* x_vec = reinterpret_cast<const float4*>(x);                \
+    const float4* y_vec = reinterpret_cast<const float4*>(y);                \
+    float4* z_vec = reinterpret_cast<float4*>(z);                            \
+    float4 x_f4, y_f4;                                                       \
+    for (int i = tid; i < loop; i += stride) {                               \
+      x_f4 = x_vec[i];                                                       \
+      y_f4 = y_vec[i];                                                       \
+      z_vec[i] = make_float4(x_f4.x expr y_f4.x, x_f4.y expr y_f4.y,         \
+                             x_f4.z expr y_f4.z, x_f4.w expr y_f4.w);        \
+    }                                                                        \
+    if (tid == loop && remainder != 0) {                                     \
+      while (remainder) {                                                    \
+        int idx = size - remainder;                                          \
+        remainder--;                                                         \
+        z[idx] = x[idx] expr y[idx];                                         \
+      }                                                                      \
+    }                                                                        \
+  }                                                                          \
+  inline __global__ void SameDimsElemwise##Func##CUDAKernel(                 \
+      const half* __restrict__ x, const half* __restrict__ y, half* z,       \
+      int64_t size) {                                                        \
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;                         \
+    int stride = gridDim.x * blockDim.x;                                     \
+    int loop = size / 8;                                                     \
+    int remainder = size % 8;                                                \
+    const float4* x_vec = reinterpret_cast<const float4*>(x);                \
+    const float4* y_vec = reinterpret_cast<const float4*>(y);                \
+    float4* z_vec = reinterpret_cast<float4*>(z);                            \
+    float4 x_h8, y_h8, z_h8;                                                 \
+    for (int i = tid; i < loop; i += stride) {                               \
+      x_h8 = x_vec[i];                                                       \
+      y_h8 = y_vec[i];                                                       \
+      half2* x_h2 = reinterpret_cast<half2*>(&x_h8);                         \
+      half2* y_h2 = reinterpret_cast<half2*>(&y_h8);                         \
+      half2* z_h2 = reinterpret_cast<half2*>(&z_h8);                         \
+      z_h2[0] = FP16Function(x_h2[0], y_h2[0]);                              \
+      z_h2[1] = FP16Function(x_h2[1], y_h2[1]);                              \
+      z_h2[2] = FP16Function(x_h2[2], y_h2[2]);                              \
+      z_h2[3] = FP16Function(x_h2[3], y_h2[3]);                              \
+      z_vec[i] = z_h8;                                                       \
+    }                                                                        \
+    if (tid == loop && remainder != 0) {                                     \
+      while (remainder) {                                                    \
+        int idx = size - remainder;                                          \
+        remainder--;                                                         \
+        z[idx] =                                                             \
+            __float2half(__half2float(x[idx]) expr __half2float(y[idx]));    \
+      }                                                                      \
+    }                                                                        \
   }
 DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Add, +, half2_add)
 DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Sub, -, half2_sub)
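Editor's note: the rewritten macro above is the core of the vectorization. FP32 moves four lanes per float4; FP16 moves eight halves per float4, reinterpreted as four half2 packets. Below is a standalone toy kernel showing just the FP16 inner step, my own sketch rather than patch code; it assumes a GPU with native half2 arithmetic (sm_53 or newer) and a matching `-arch` flag.

```cuda
#include <cstdio>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

// Eight halves per float4 load, computed as four half2 additions --
// the same packing the patched FP16 kernel macro uses.
__global__ void h8_add(const half* __restrict__ x, const half* __restrict__ y,
                       half* z, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  int loop = size / 8;
  const float4* x_vec = reinterpret_cast<const float4*>(x);
  const float4* y_vec = reinterpret_cast<const float4*>(y);
  float4* z_vec = reinterpret_cast<float4*>(z);
  for (int i = tid; i < loop; i += stride) {
    float4 a = x_vec[i], b = y_vec[i], c;
    half2* a2 = reinterpret_cast<half2*>(&a);
    half2* b2 = reinterpret_cast<half2*>(&b);
    half2* c2 = reinterpret_cast<half2*>(&c);
    for (int k = 0; k < 4; ++k) c2[k] = __hadd2(a2[k], b2[k]);
    z_vec[i] = c;
  }
  if (tid == loop) {  // scalar tail: size % 8 elements
    for (int idx = size - size % 8; idx < size; ++idx)
      z[idx] = __float2half(__half2float(x[idx]) + __half2float(y[idx]));
  }
}

int main() {
  const int n = 1000;  // not a multiple of 8
  half *x, *y, *z;
  cudaMallocManaged(&x, n * sizeof(half));
  cudaMallocManaged(&y, n * sizeof(half));
  cudaMallocManaged(&z, n * sizeof(half));
  for (int i = 0; i < n; ++i) x[i] = y[i] = __float2half(1.f);
  h8_add<<<(n / 8 + 255) / 256 + 1, 256>>>(x, y, z, n);
  cudaDeviceSynchronize();
  printf("z[n-1] = %f\n", __half2float(z[n - 1]));  // expect 2.0
  cudaFree(x); cudaFree(y); cudaFree(z);
  return 0;
}
```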
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
index 1996cc471ac2a0..192999fd2ac831 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -43,7 +43,7 @@ struct SameDimsElemwiseSub<platform::CUDADeviceContext, platform::float16> {
                   const framework::Tensor* x, const framework::Tensor* y,
                   framework::Tensor* z) {
     auto size = x->numel();
-    dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) /
+    dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) /
                               PADDLE_CUDA_THREAD_SIZE,
                           1);
     dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1);
diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h
index cbaeb0c4e42564..4cefadb24ec5d8 100644
--- a/paddle/fluid/operators/expand_as_op.h
+++ b/paddle/fluid/operators/expand_as_op.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 
 #define MAX_RANK_SUPPORTED 6
 
@@ -75,7 +76,7 @@ class ExpandAsKernel : public framework::OpKernel<T> {
     auto in_dims = in0->dims();
     auto* target_tensor = context.Input<Tensor>("target_tensor");
     auto* out0 = context.Output<Tensor>("Out");
-    Eigen::DSizes<int, Rank> bcast_dims;
+    Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
     int bcast_dims_remainder = 0;
     auto x_dims = in0->dims();
     auto y_dims = target_tensor->dims();
@@ -104,7 +105,8 @@ class ExpandAsKernel : public framework::OpKernel<T> {
     auto y = EigenTensor<T, Rank>::From(*out0);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
-    y.device(place) = x.broadcast(bcast_dims);
+    EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
+                                                                 bcast_dims);
   }
 };
 
@@ -165,20 +167,19 @@ class ExpandAsGradKernel : public framework::OpKernel<T> {
     auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
     out0->mutable_data<T>(context.GetPlace());
     auto x_grad = EigenVector<T>::Flatten(*out0);
-    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
     for (size_t i = 0; i < reshape_size; ++i) {
       reshape_dims[i] = reshape_dims_vec[i];
     }
-    Eigen::DSizes<int, Dims> reduce_dims;
+    Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
     for (size_t i = 0; i < reduce_size; ++i) {
       reduce_dims[i] = reduce_dims_vec[i];
     }
     auto out_grad = EigenVector<T>::Flatten(*in0);
-    x_grad.device(
-        *context.template device_context<DeviceContext>().eigen_device()) =
-        out_grad.reshape(reshape_dims)
-            .sum(reduce_dims)
-            .reshape(x_grad.dimensions());
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
+        place, x_grad, out_grad, reduce_dims, reshape_dims);
   }
 };
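Editor's note: all of the Expand*GradKernel bodies funnel into the same reshape-sum-reshape expression that `EigenBroadcastGrad::Eval` now wraps. The host-side sketch below (my own concrete numbers, using `Eigen::DefaultDevice`) shows why the gradient of broadcasting a row four times is a sum over the replicated axis.

```cuda
#include <iostream>
#include "unsupported/Eigen/CXX11/Tensor"

// Gradient of broadcasting a length-3 row to shape (4, 3): sum the flat
// upstream gradient back over the replicated axis via the same
// reshape -> sum -> reshape expression that EigenBroadcastGrad::Eval wraps.
int main() {
  Eigen::Tensor<float, 1, Eigen::RowMajor> dout(12);  // flattened (4,3) grad
  dout.setConstant(1.f);

  Eigen::DSizes<Eigen::DenseIndex, 2> reshape_dims(4, 3);  // (reps, dim)
  Eigen::DSizes<Eigen::DenseIndex, 1> reduce_dims(0);      // sum over reps
  Eigen::Tensor<float, 1, Eigen::RowMajor> dx(3);

  Eigen::DefaultDevice dev;
  dx.device(dev) =
      dout.reshape(reshape_dims).sum(reduce_dims).reshape(dx.dimensions());

  std::cout << dx << std::endl;  // each entry is 4
  return 0;
}
```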
diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h
index c36e461926f5c1..441dd3538045cf 100644
--- a/paddle/fluid/operators/expand_as_v2_op.h
+++ b/paddle/fluid/operators/expand_as_v2_op.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 
 #define MAX_RANK_SUPPORTED 6
 
@@ -108,7 +109,7 @@ class ExpandAsV2Kernel : public framework::OpKernel<T> {
       }
     }
     auto* out0 = context.Output<Tensor>("Out");
-    Eigen::DSizes<int, Rank> bcast_dims;
+    Eigen::DSizes<Eigen::DenseIndex, Rank> bcast_dims;
     for (size_t i = 0; i < repeat_times.size(); ++i) {
       bcast_dims[i] = repeat_times[i];
     }
@@ -122,7 +123,8 @@ class ExpandAsV2Kernel : public framework::OpKernel<T> {
     auto y = EigenTensor<T, Rank>::From(*out0, out_dims);
     auto& place =
         *context.template device_context<DeviceContext>().eigen_device();
-    y.device(place) = x.broadcast(bcast_dims);
+    EigenBroadcast<std::decay_t<decltype(place)>, T, Rank>::Eval(place, y, x,
+                                                                 bcast_dims);
   }
 };
 
@@ -191,20 +193,19 @@ class ExpandAsV2GradKernel : public framework::OpKernel<T> {
     auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
     out0->mutable_data<T>(context.GetPlace());
     auto x_grad = EigenVector<T>::Flatten(*out0);
-    Eigen::DSizes<int, Dims * 2> reshape_dims;
+    Eigen::DSizes<Eigen::DenseIndex, Dims * 2> reshape_dims;
     for (size_t i = 0; i < reshape_size; ++i) {
       reshape_dims[i] = reshape_dims_vec[i];
     }
-    Eigen::DSizes<int, Dims> reduce_dims;
+    Eigen::DSizes<Eigen::DenseIndex, Dims> reduce_dims;
     for (size_t i = 0; i < reduce_size; ++i) {
       reduce_dims[i] = reduce_dims_vec[i];
     }
     auto out_grad = EigenVector<T>::Flatten(*in0);
-    x_grad.device(
-        *context.template device_context<DeviceContext>().eigen_device()) =
-        out_grad.reshape(reshape_dims)
-            .sum(reduce_dims)
-            .reshape(x_grad.dimensions());
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    EigenBroadcastGrad<std::decay_t<decltype(place)>, T, Dims>::Eval(
+        place, x_grad, out_grad, reduce_dims, reshape_dims);
   }
 };
 
diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h
index 8b79a1feb8ce1f..abd525497d6849 100644
--- a/paddle/fluid/operators/expand_op.h
+++ b/paddle/fluid/operators/expand_op.h
@@ -25,6 +25,7 @@ limitations under the License. */
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -141,7 +142,7 @@ class ExpandKernel : public framework::OpKernel { "of dimensions (%d) of the input.", expand_times.size(), static_cast(in_dims.size()))); auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } @@ -160,9 +161,11 @@ class ExpandKernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -241,20 +244,19 @@ class ExpandGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index ec9c6e62f272ed..af5fdf22cd906c 100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -174,7 +175,7 @@ class ExpandV2Kernel : public framework::OpKernel { } auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < repeat_times.size(); ++i) { bcast_dims[i] = repeat_times[i]; } @@ -194,9 +195,11 @@ class ExpandV2Kernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -275,20 +278,19 @@ class ExpandV2GradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 8a96d057cbe039..caa29309901932 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -149,6 +149,7 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc index 78c62a4053b641..e784c20b8b8b4f 100644 --- a/paddle/fluid/operators/fill_constant_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -17,6 +17,7 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 84fa0d6af990e2..55662e1d0aad7a 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -66,18 +66,23 @@ class CrossEntropyFunctor { int batch_size = prob->dims()[0]; int class_num = prob->dims()[1]; +#ifdef __HIPCC__ + constexpr int kMaxBlockDim = 256; +#else + constexpr int kMaxBlockDim = 512; +#endif if (softLabel) { const T* label_data = labels->data(); - int block = class_num > 512 - ? 512 + int block = class_num > kMaxBlockDim + ? 
kMaxBlockDim : pow(2, static_cast(std::log2(class_num))); SoftCrossEntropyKernel<<>>( loss_data, prob_data, label_data, class_num); } else { const int64_t* label_data = labels->data(); - int block = 512; + int block = kMaxBlockDim; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num, diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 7439a959d38285..5fd543b5c6c5cc 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -22,6 +22,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -52,8 +53,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { const int filter_multiplier, const int filter_height, \ const int filter_width, const int stride_height, const int stride_width, \ const int padding_height, const int padding_width, \ - const int dilate_height, const int dilate_width, T *const output_data, \ - const DataLayout data_layout = DataLayout::kNCHW + const int dilate_height, const int dilate_width, T *const output_data // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. @@ -123,7 +123,6 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( const int batch = idx / output_width / output_height / output_channels; const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; T value = 0; const int h_in_start = -padding_height + h_out * stride_height; const int w_in_start = -padding_width + w_out * stride_width; @@ -142,13 +141,14 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { if (h_in >= h_start && h_in < h_end && w_in >= w_start && w_in < w_end) { int offset = ((batch * input_height + h_in) * input_width + w_in) * - output_channels + + input_channels + c_in; T in_data = input_data[offset]; + const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { - value += weight[weight_offset] * max(0.0f, in_data); + value += weight[0] * max(0.0f, in_data); } else { - value += weight[weight_offset] * in_data; + value += weight[0] * in_data; } } weight_offset++; @@ -161,10 +161,10 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( } template -__device__ __inline__ void KernelDepthwiseConvCFilter( +__device__ __inline__ void KernelDepthwiseConvCFilterNCHW( ARG_DEFINE_KernelDepthwiseConv) { - const int kWeghtSize = c_filter * c_filter; - T r_weight[kWeghtSize]; + const int kWeightSize = c_filter * c_filter; + T r_weight[kWeightSize]; const int batch = blockIdx.y; const int c_out = blockIdx.x; const T* weight = filter_data + c_out * c_filter * c_filter; @@ -182,13 +182,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( const int h_in_end = h_in_start + c_filter * dilate_height; const int w_in_end = w_in_start + c_filter * dilate_width; - int in_offset; - if (data_layout != DataLayout::kNHWC) { - in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - } else { - in_offset = batch * input_height * input_width * input_channels; - } + int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; const int 
h_end = h_in_end < input_height ? h_in_end : input_height; const int w_end = w_in_end < input_width ? w_in_end : input_width; @@ -201,13 +196,63 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( w_in += dilate_width, w_f++) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset; - if (data_layout != DataLayout::kNHWC) { - offset = in_offset + h_in * input_width + w_in; + int offset = in_offset + h_in * input_width + w_in; + if (fuse_relu_before_conv) { + value += r_weight[h_f * c_filter + w_f] * + max(0.0f, input_data[offset]); } else { - offset = in_offset + - (h_in * input_width + w_in) * input_channels + c_in; + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } + } + } + } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; + } + } +} + +template +__device__ __inline__ void KernelDepthwiseConvCFilterNHWC( + ARG_DEFINE_KernelDepthwiseConv) { + const int batch = blockIdx.z; + int h_out = blockIdx.x * dilate_height + blockIdx.y; + if (h_out >= output_height) { + return; + } + int in_offset = batch * input_height * input_width * input_channels; + int out_offset = + (batch * output_height + h_out) * output_width * output_channels; + const int h_in_start = -padding_height + h_out * stride_height; + const int wi_size = (output_width + dilate_width - 1) / dilate_width; + const int kWeightSize = c_filter * c_filter; + T r_weight[kWeightSize]; + + for (int c_out = threadIdx.x; c_out < output_channels; c_out += blockDim.x) { + for (int i = 0; i < c_filter * c_filter; i++) { + const T* weight = filter_data + i * output_channels + c_out; + r_weight[i] = weight[0]; + } + const int c_in = c_out / filter_multiplier; + for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { + int i_dw = i / wi_size; + int i_wi = i - i_dw * wi_size; + int w_out = i_wi * dilate_width + i_dw; + if (w_out >= output_width) { + continue; + } + T value = 0; + const int w_in_start = -padding_width + w_out * stride_width; + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; + h_in += dilate_height, h_f++) { + for (int w_in = w_in_start, w_f = 0; w_f < c_filter; + w_in += dilate_width, w_f++) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && + w_in < input_width) { + int offset = + in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * max(0.0f, input_data[offset]); @@ -217,23 +262,14 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( } } } - int index; - if (data_layout != DataLayout::kNHWC) { - index = ((batch * gridDim.x + c_out) * output_height + h_out) * - output_width + - w_out; - } else { - index = ((batch * output_height + h_out) * output_width + w_out) * - gridDim.x + - c_out; - } + int index = out_offset + w_out * output_channels + c_out; output_data[index] = value; } } } template + DataLayout data_layout, bool fuse_relu_before_conv> __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; @@ -244,28 +280,37 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { w_stride = c_stride; } if (c_filter == -1) { - if (data_layout == DataLayout::kNCHW) { + if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvNCHW( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, final_filter_multiplier, 
filter_height, filter_width, h_stride, w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data, data_layout); + output_data); } else { KernelDepthwiseConvNHWC( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, final_filter_multiplier, filter_height, filter_width, h_stride, w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data, data_layout); + output_data); } } else { - KernelDepthwiseConvCFilter( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data, data_layout); + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvCFilterNCHW( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_data); + } else { + KernelDepthwiseConvCFilterNHWC( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_data); + } } } @@ -280,40 +325,27 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { const int filter_width, const int stride_height, const int stride_width, \ const int padding_height, const int padding_width, \ const int dilate_height, const int dilate_width, \ - T *const input_grad_data, \ - const DataLayout data_layout = DataLayout::kNCHW + T *const input_grad_data template -__device__ __inline__ void KernelDepthwiseConvInputGrad( +__device__ __inline__ void KernelDepthwiseConvInputGradNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - const int c_out_start = c_in * filter_multiplier; - int h_out_start = h_in - (filter_height - 1) * dilate_height + padding_height; - int h_out_end = h_in + padding_height; - int w_out_start = w_in - (filter_width - 1) * dilate_width + padding_width; - int w_out_end = w_in + padding_width; T value = 0; - int index; - if (data_layout != DataLayout::kNHWC) { - index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; - } else { - index = - ((batch * input_height + h_in) * input_width + w_in) * gridDim.x + - c_in; - } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; if (fuse_relu_before_conv) { if (input_data[index] <= 0) { @@ -335,20 +367,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset; - if (data_layout != DataLayout::kNHWC) { - output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - } else { - output_grad_offset = - ((batch * output_height + s_h_out) * output_width + - 
s_w_out) * - output_channels + - c_out; - } + int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += output_grad_data[output_grad_offset] * + filter_data[filter_offset]; + } + } + } + } + input_grad_data[index] = value; + } + } +} + +template +__device__ __inline__ void KernelDepthwiseConvInputGradNHWC( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int batch = blockIdx.z; + int h_in = blockIdx.x * dilate_height + blockIdx.y; + if (h_in >= input_height) { + return; + } + + for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { + for (int w_in = threadIdx.y; w_in < input_width; w_in += blockDim.y) { + int h_out_start = + h_in - (filter_height - 1) * dilate_height + padding_height; + int w_out_start = + w_in - (filter_width - 1) * dilate_width + padding_width; + + T value = 0; + int index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + int weight_offset = filter_height * filter_width; + for (int h_out = h_out_start, h_f = 0; h_f < filter_height; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < filter_width; + w_out += dilate_width, w_f++) { + weight_offset--; + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + int output_grad_offset = + ((batch * output_height + s_h_out) * output_width + s_w_out) * + output_channels + + c_out; + int filter_offset = weight_offset * output_channels + c_out; value += output_grad_data[output_grad_offset] * filter_data[filter_offset]; } @@ -362,10 +441,10 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( template -__device__ __inline__ void KernelDepthwiseConvInputGradCFilter( +__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; - T r_weight[kWeghtSize]; + const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeightSize]; const int batch = blockIdx.y; const int c_in = blockIdx.x; @@ -379,24 +458,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; - int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value = 0; - int index; - if (data_layout != DataLayout::kNHWC) { - index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; - } else { - index = - ((batch * input_height + h_in) * input_width + w_in) * gridDim.x + - c_in; - } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; if (fuse_relu_before_conv) { if (input_data[index] <= 0) { input_grad_data[index] = 0; @@ -415,20 +483,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out 
>= 0 && s_w_out < output_width) { - int output_grad_offset; - if (data_layout != DataLayout::kNHWC) { - output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - } else { - output_grad_offset = - ((batch * output_height + s_h_out) * output_width + - s_w_out) * - output_channels + - c_out; - } + int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; value += output_grad_data[output_grad_offset] * r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; @@ -441,47 +500,137 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( } } -template +__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + int h_in = blockIdx.x * dilate_height + blockIdx.y; + if (h_in >= input_height) { + return; + } + const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeightSize]; + const int batch = blockIdx.z; + const int wi_size = (input_width + dilate_width - 1) / dilate_width; + const int h_out_start = + h_in - (c_filter - 1) * dilate_height + padding_height; + + for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { + for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { + int c_out = c_in * c_filter_multiplier + c_i; + for (int i = 0; i < c_filter * c_filter; i++) + r_weight[i + c_i * c_filter * c_filter] = + filter_data[(c_filter * c_filter - i - 1) * output_channels + + c_out]; + } + for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { + int i_dw = i / wi_size; + int i_wi = i - i_dw * wi_size; + int w_in = i_wi * dilate_width + i_dw; + if (w_in >= input_width) { + continue; + } + int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + + T value = 0; + int index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } + + for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { + int c_out = c_in * c_filter_multiplier + c_i; + for (int h_out = h_out_start, h_f = 0; h_f < c_filter; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < c_filter; + w_out += dilate_width, w_f++) { + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + int output_grad_offset = + ((batch * output_height + s_h_out) * output_width + s_w_out) * + output_channels + + c_out; + value += + output_grad_data[output_grad_offset] * + r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; + } + } + } + } + input_grad_data[index] = value; + } + } +} + +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvInputGrad( - input_data, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, input_grad_data, data_layout); - else if (c_filter == -1) - KernelDepthwiseConvInputGrad( - input_data, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, 
input_height, input_width, - c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, - padding_height, padding_width, dilate_height, dilate_width, - input_grad_data, data_layout); - else - KernelDepthwiseConvInputGradCFilter( - input_data, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, - padding_height, padding_width, dilate_height, dilate_width, - input_grad_data, data_layout); + int final_filter_multiplier = filter_multiplier; + int h_stride = stride_height; + int w_stride = stride_width; + if (c_filter_multiplier != 0) { + final_filter_multiplier = c_filter_multiplier; + h_stride = c_stride; + w_stride = c_stride; + } + + if (c_filter_multiplier == 0 || c_filter == -1) { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvInputGradNCHW( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, final_filter_multiplier, filter_height, + filter_width, h_stride, w_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } else { + KernelDepthwiseConvInputGradNHWC( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, final_filter_multiplier, filter_height, + filter_width, h_stride, w_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } + } else { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvInputGradCFilterNCHW( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, filter_height, + filter_width, c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } else { + KernelDepthwiseConvInputGradCFilterNHWC( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, filter_height, + filter_width, c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } + } } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
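Editor's note on the filter-gradient kernels that follow: every thread privately accumulates dout * input products for its slice and then publishes one atomic add per output weight (`CudaAtomicAddWithWarp` in the NCHW path, `platform::CudaAtomicAdd` in the NHWC paths). Below is a minimal standalone version of that accumulate-then-atomic pattern, reduced to a 1x1 filter so the gradient is a plain dot product; the names are mine, not the patch's.

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Each thread sums its grid-strided slice of dout*in into a register,
// then a single atomicAdd per thread publishes the partial sum.
__global__ void filter_grad_1tap(const float* __restrict__ dout,
                                 const float* __restrict__ in, int n,
                                 float* wgrad) {
  float s = 0.f;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    s += dout[i] * in[i];
  }
  atomicAdd(wgrad, s);  // Paddle wraps this in CudaAtomicAddWithWarp
}

int main() {
  const int n = 1 << 16;
  float *dout, *in, *wgrad;
  cudaMallocManaged(&dout, n * sizeof(float));
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&wgrad, sizeof(float));
  for (int i = 0; i < n; ++i) { dout[i] = 1.f; in[i] = 2.f; }
  *wgrad = 0.f;
  filter_grad_1tap<<<64, 256>>>(dout, in, n, wgrad);
  cudaDeviceSynchronize();
  printf("wgrad = %f (expect %f)\n", *wgrad, 2.f * n);
  cudaFree(dout); cudaFree(in); cudaFree(wgrad);
  return 0;
}
```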
template -__device__ __inline__ void KernelDepthwiseConvFilterGrad( +__device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, const int input_channels, const int input_height, const int input_width, const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data, - const DataLayout data_layout = DataLayout::kNCHW) { + const int dilate_width, T* filter_grad_data) { T s = 0; - int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; for (int image_w = threadIdx.x; image_w < output_width; @@ -499,45 +648,137 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad( if (image_wk < 0 || image_wk >= input_width) continue; #define gaid(N, C, H, W) \ ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) -#define gaid_nhwc(N, H, W, C) \ - ((((N)*output_height + (H)) * output_width + (W)) * gridDim.z + (C)) - int input_id; - if (data_layout != DataLayout::kNHWC) { - input_id = ((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk; - if (fuse_relu_before_conv) { - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - max(0.0f, input_data[input_id]); - } else { - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[input_id]; - } + int input_id = ((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk; + if (fuse_relu_before_conv) { + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + max(0.0f, input_data[input_id]); } else { - input_id = + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[input_id]; + } +#undef gaid + } + } + } + CudaAtomicAddWithWarp(&filter_grad_data[gbid], s); +} + +template +__device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + int bid = blockIdx.z; + int image_h = blockIdx.y; + int kernel_iw = blockIdx.x % filter_width; + int kernel_ih = blockIdx.x / filter_width; + for (int kernel_id = threadIdx.x; kernel_id < output_channels; + kernel_id += blockDim.x) { + T s = 0; + int gbid = + ((kernel_id * filter_height) + kernel_ih) * filter_width + kernel_iw; + for (int image_w = threadIdx.y; image_w < output_width; + image_w += blockDim.y) { + int kernel_h = kernel_ih * dilate_height - padding_height; + int kernel_w = kernel_iw * dilate_width - padding_width; + + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; +#define gaid(N, H, W, C) \ + ((((N)*output_height + (H)) * output_width + (W)) * output_channels + (C)) + int input_id = + ((bid * 
input_height + image_hk) * input_width + image_wk) * + input_channels + + kernel_id / filter_multiplier; + if (fuse_relu_before_conv) { + s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * + max(0.0f, input_data[input_id]); + } else { + s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * + input_data[input_id]; + } +#undef gaid + } + platform::CudaAtomicAdd(&filter_grad_data[gbid], s); + } +} + +template +__device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + const int bid = blockIdx.z; + int image_h = blockIdx.x * dilate_height + blockIdx.y; + if (image_h >= output_height) { + return; + } + const int kWeightSize = c_filter * c_filter; + T r_weight[kWeightSize]; + const int wi_size = (output_width + dilate_width - 1) / dilate_width; + + for (int kernel_id = threadIdx.x; kernel_id < output_channels; + kernel_id += blockDim.x) { + for (int i = 0; i < c_filter * c_filter; ++i) { + r_weight[i] = 0; + } + for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { + int i_dw = i / wi_size; + int i_wi = i - i_dw * wi_size; + int image_w = i_wi * dilate_width + i_dw; + if (image_w >= output_width) { + continue; + } + for (int kernel_ih = 0; kernel_ih < c_filter; ++kernel_ih) { + for (int kernel_iw = 0; kernel_iw < c_filter; ++kernel_iw) { + int kernel_h = kernel_ih * dilate_height - padding_height; + int kernel_w = kernel_iw * dilate_width - padding_width; + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; + int input_id = ((bid * input_height + image_hk) * input_width + image_wk) * - (gridDim.z / filter_multiplier) + + input_channels + kernel_id / filter_multiplier; + int output_id = + ((bid * output_height + image_h) * output_width + image_w) * + output_channels + + kernel_id; + T s = 0; if (fuse_relu_before_conv) { - s += output_grad_data[gaid_nhwc(bid, image_h, image_w, kernel_id)] * - max(0.0f, input_data[input_id]); + s = output_grad_data[output_id] * max(0.0f, input_data[input_id]); } else { - s += output_grad_data[gaid_nhwc(bid, image_h, image_w, kernel_id)] * - input_data[input_id]; + s = output_grad_data[output_id] * input_data[input_id]; } + r_weight[kernel_ih * c_filter + kernel_iw] += s; } - -#undef gaid } } + for (int i = 0; i < c_filter * c_filter; ++i) { + T* weight = filter_grad_data + i * output_channels + kernel_id; + platform::CudaAtomicAdd(&weight[0], r_weight[i]); + } } - CudaAtomicAddWithWarp(&filter_grad_data[gbid], s); } -template +template __global__ void KernelDepthwiseConvFilterGradSp( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, @@ -545,22 +786,49 @@ __global__ void KernelDepthwiseConvFilterGradSp( const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, const int 
dilate_height, - const int dilate_width, T* filter_grad_data, - const DataLayout data_layout = DataLayout::kNCHW) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data, data_layout); - else - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data, data_layout); + const int dilate_width, T* filter_grad_data) { + int final_filter_multiplier = filter_multiplier; + int h_stride = stride_height; + int w_stride = stride_width; + if (c_filter_multiplier != 0) { + final_filter_multiplier = c_filter_multiplier; + h_stride = c_stride; + w_stride = c_stride; + } + if (c_filter_multiplier == 0 || c_filter == -1) { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvFilterGradNCHW( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } else { + KernelDepthwiseConvFilterGradNHWC( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } + } else { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvFilterGradNCHW( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } else { + KernelDepthwiseConvFilterGradCFilterNHWC( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } + } } /* @@ -608,40 +876,86 @@ class DepthwiseConvFunctor(); T* output_data = output->mutable_data(context.GetPlace()); + framework::Tensor filter_hwc; + if (data_layout == DataLayout::kNHWC) { + framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], + filter.dims()[0], filter.dims()[1]}); + filter_hwc.Resize(filter_hwc_dims); + filter_hwc.mutable_data(context.GetPlace()); + std::vector perm_axis({2, 3, 0, 1}); + math::TransposeNormal trans; + trans(context, filter, &filter_hwc, perm_axis); + filter_data = filter_hwc.data(); + } + int thread = 512; - if (output_width > 1024 && output_width <= 2048) - thread = (output_width - 1) / 2 + 1; - else if (output_width > 512 && output_width <= 1024) - thread = output_width; - int blocks = std::min(std::max(thread / output_width, 1), output_height); - dim3 threads(std::min(output_width, thread), blocks, 1); - dim3 grid(output_channels, batch_size, 1); + int blocks; + dim3 threads; 
+ dim3 grid; + if (data_layout != DataLayout::kNHWC) { + if (output_width > 1024 && output_width <= 2048) + thread = (output_width - 1) / 2 + 1; + else if (output_width > 512 && output_width <= 1024) + thread = output_width; +#ifdef __HIPCC__ + thread = std::min(thread, 256); +#endif + blocks = std::min(std::max(thread / output_width, 1), output_height); + threads = dim3(std::min(output_width, thread), blocks, 1); + grid = dim3(output_channels, batch_size, 1); + } else { +#ifdef __HIPCC__ + thread = std::min(thread, 256); +#endif + blocks = std::min( + std::max(thread / output_channels, 1), + ((output_width + dilate_width - 1) / dilate_width) * dilate_width); + threads = dim3(std::min(output_channels, thread), blocks, 1); + grid = dim3((output_height + dilate_height - 1) / dilate_height, + dilate_height, batch_size); + } int filter_multiplier = output_channels / input_channels; - int nums_output = batch_size * output_channels * output_height * output_width; +#ifdef __HIPCC__ + int block_size = 256; + int grid_size = std::min((nums_output + block_size - 1) / block_size, 256); +#else int block_size = 512; + int grid_size = (nums_output + block_size - 1) / block_size; +#endif -#define check_case(c_filter_multiplier, c_stride, c_filter) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride && \ - (ksize_height == ksize_width && ksize_height == c_filter || \ - c_filter == -1)) { \ - if (c_filter == -1) { \ - threads.x = block_size; \ - grid.x = (nums_output + block_size - 1) / block_size; \ - threads.y = threads.z = grid.y = grid.z = 1; \ - } \ - KernelDepthwiseConvSp< \ - T, c_filter_multiplier, c_stride, c_filter, \ - fuse_relu_before_conv><<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data, data_layout); \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (c_filter == -1) { \ + threads.x = block_size; \ + grid.x = grid_size; \ + threads.y = threads.z = grid.y = grid.z = 1; \ + } \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + } else { \ + KernelDepthwiseConvSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -705,32 +1019,67 @@ class DepthwiseConvInputGradFunctor(); T* 
input_grad_data = input_grad->mutable_data(context.GetPlace()); + framework::Tensor filter_hwc; + if (data_layout == DataLayout::kNHWC) { + framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], + filter.dims()[0], filter.dims()[1]}); + filter_hwc.Resize(filter_hwc_dims); + filter_hwc.mutable_data(context.GetPlace()); + std::vector perm_axis({2, 3, 0, 1}); + math::TransposeNormal trans; + trans(context, filter, &filter_hwc, perm_axis); + filter_data = filter_hwc.data(); + } + int thread = 512; - if (input_width > 1024 && input_width <= 2048) - thread = (input_width - 1) / 2 + 1; - else if (input_width > 512 && input_width <= 1024) - thread = input_width; - int blocks = std::min(std::max(thread / input_width, 1), input_height); - dim3 threads(std::min(input_width, thread), blocks, 1); - dim3 grid(input_channels, batch_size, 1); + int blocks; + dim3 threads; + dim3 grid; + if (data_layout != DataLayout::kNHWC) { + if (input_width > 1024 && input_width <= 2048) { + thread = (input_width - 1) / 2 + 1; + } else if (input_width > 512 && input_width <= 1024) { + thread = input_width; + } + blocks = std::min(std::max(thread / input_width, 1), input_height); + threads = dim3(std::min(input_width, thread), blocks, 1); + grid = dim3(input_channels, batch_size, 1); + } else { + blocks = std::min( + std::max(thread / input_channels, 1), + ((input_width + dilate_width - 1) / dilate_width) * dilate_width); + threads = dim3(std::min(input_channels, thread), blocks, 1); + grid = dim3((input_height + dilate_height - 1) / dilate_height, + dilate_height, batch_size); + } int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride, c_filter) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride && \ - (ksize_height == ksize_width && ksize_height == c_filter || \ - c_filter == -1)) { \ - KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, c_filter, \ - fuse_relu_before_conv><<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data, \ - data_layout); \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, output_grad_data, filter_data, batch_size, \ + output_channels, output_height, output_width, input_channels, \ + input_height, input_width, filter_multiplier, ksize_height, \ + ksize_width, stride_height, stride_width, padding_height, \ + padding_width, dilate_height, dilate_width, input_grad_data); \ + } else { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, output_grad_data, filter_data, batch_size, \ + output_channels, output_height, output_width, input_channels, \ + input_height, input_width, filter_multiplier, ksize_height, \ + 
ksize_width, stride_height, stride_width, padding_height, \ + padding_width, dilate_height, dilate_width, input_grad_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -793,30 +1142,95 @@ class DepthwiseConvFilterGradFunctormutable_data(context.GetPlace()); int block_size = 512; - if (output_width > 1024 && output_width <= 2048) - block_size = (output_width - 1) / 2 + 1; - else if (output_width > 512 && output_width <= 1024) - block_size = output_width; - int crop_output_height = - std::min(std::max(block_size / output_width, 1), output_height); - dim3 grid(ksize_width, ksize_height, output_channels); - dim3 threads(std::min(output_width, block_size), crop_output_height, 1); + int blocks; + dim3 threads; + dim3 grid; + if (data_layout != DataLayout::kNHWC) { + if (output_width > 1024 && output_width <= 2048) { + block_size = (output_width - 1) / 2 + 1; + } else if (output_width > 512 && output_width <= 1024) { + block_size = output_width; + } + blocks = std::min(std::max(block_size / output_width, 1), output_height); + grid = dim3(ksize_width, ksize_height, output_channels); + threads = dim3(std::min(output_width, block_size), blocks, 1); + } else { + blocks = std::min( + std::max(block_size / output_channels, 1), + ((output_width + dilate_width - 1) / dilate_width) * dilate_width); + grid = dim3((output_height + dilate_height - 1) / dilate_height, + dilate_height, batch_size); + threads = dim3(std::min(output_channels, block_size), blocks, 1); + } int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier) \ - if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ - KernelDepthwiseConvFilterGradSp< \ - T, c_filter_multiplier, \ - fuse_relu_before_conv><<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data, data_layout); \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, filter_grad_data); \ + } else { \ + framework::Tensor filter_grad_hwc; \ + if (c_filter != -1) { \ + framework::DDim filter_grad_hwc_dims( \ + {filter_grad->dims()[2], filter_grad->dims()[3], \ + filter_grad->dims()[0], filter_grad->dims()[1]}); \ + filter_grad_hwc.Resize(filter_grad_hwc_dims); \ + filter_grad_hwc.mutable_data(context.GetPlace()); \ + math::SetConstant set_zero; \ + set_zero(context, &filter_grad_hwc, static_cast(0)); \ + filter_grad_data = filter_grad_hwc.data(); \ + } else { \ + block_size = 512; \ + if (output_channels > 1024 && output_channels <= 2048) { \ + block_size = (output_channels - 1) / 2 + 1; \ + } else if (output_channels > 512 && output_channels 
<= 1024) { \ + block_size = output_channels; \ + } \ + blocks = \ + std::min(std::max(block_size / output_channels, 1), output_width); \ + grid = dim3(ksize_width * ksize_height, output_height, batch_size); \ + threads = dim3(std::min(output_channels, block_size), blocks, 1); \ + } \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, filter_grad_data); \ + if (c_filter != -1) { \ + std::vector perm_axis({2, 3, 0, 1}); \ + math::TransposeNormal trans; \ + trans(context, filter_grad_hwc, filter_grad, perm_axis); \ + } \ + } \ + return; \ } - check_case(1); - check_case(0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(2, 1, 3); + check_case(2, 1, 5); + check_case(2, 1, -1); + check_case(2, 2, 3); + check_case(2, 2, 5); + check_case(2, 2, -1); + check_case(0, 0, -1); #undef check_case } }; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index a61b50faa757cf..5242d03c11c997 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -51,6 +51,7 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index cc8925fcf8aeee..2b93cd926081ec 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -35,6 +35,7 @@ using complex128 = paddle::platform::complex128; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 879e367281c0a3..9e9fe5b9c1020d 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -54,10 +54,11 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, - Y->mutable_data(context.GetPlace()))); + Y->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); #else cudnnTensorDescriptor_t cudnn_x_desc = xDesc.descriptor(layout, cudnn_tensor_dims); @@ -96,11 +97,12 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( 
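// The MIOpen *_V2 softmax entry points extend the original signatures with two
// trailing parameters that pick the algorithm and the normalization mode; the
// pair used in this call, MIOPEN_SOFTMAX_ACCURATE and
// MIOPEN_SOFTMAX_MODE_INSTANCE, selects the max-subtracted (numerically
// stable) softmax evaluated per instance, mirroring the forward call above.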
context.cudnn_handle(), CudnnDataType::kOne(), cudnn_y_desc,
Y->data(), cudnn_ygrad_desc, YGrad->data(),
CudnnDataType::kZero(), cudnn_xgrad_desc,
- XGrad->mutable_data(context.GetPlace())));
+ XGrad->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE,
+ MIOPEN_SOFTMAX_MODE_INSTANCE));
#else
cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor(layout, cudnn_tensor_dims);
diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc
index 12b255329da2d5..33f71b4adc066f 100644
--- a/paddle/fluid/operators/meshgrid_op.cc
+++ b/paddle/fluid/operators/meshgrid_op.cc
@@ -108,7 +108,9 @@ class MeshgridGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Out")).size(), 1,
platform::errors::InvalidArgument(
- "Number of Inputs(Out@Grad) must be larger than 1"));
+ "Number of Inputs(Out@Grad) should be larger than 1. "
+ "But received Inputs(Out@Grad)'s size = %d.",
+ ctx->Inputs(framework::GradVarName("Out")).size()));
ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
}
diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h
index 11cd43b22045c3..345e007de4a297 100644
--- a/paddle/fluid/operators/meshgrid_op.h
+++ b/paddle/fluid/operators/meshgrid_op.h
@@ -25,6 +25,7 @@
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/platform/errors.h"
#define MAX_RANK_SUPPORTED 6
@@ -60,7 +61,8 @@ class MeshgridKernel : public framework::OpKernel {
REP_MESHGRID_TEMPLATE(MAX_RANK_SUPPORTED)
default:
PADDLE_THROW(platform::errors::InvalidArgument(
- "Only support tensor nums between 1 and 6."));
+ "Expected Tensor numbers between 1 and 6, but received %d.",
+ rank));
}
}
@@ -71,7 +73,9 @@ class MeshgridKernel : public framework::OpKernel {
auto outs = context.MultiOutput("Out");
PADDLE_ENFORCE_EQ(
ins.size() > 1, true,
- platform::errors::InvalidArgument("expect at least 2 input tensors"));
+ platform::errors::InvalidArgument(
+ "Expected at least 2 input tensors, but received %d.",
+ ins.size()));
int64_t size = ins.size();
std::vector shape(size);
@@ -103,19 +107,21 @@ class MeshgridKernel : public framework::OpKernel {
reshape_ins_tensor.Resize(out_dims_reshape);
framework::DDim out_dims = framework::make_ddim(shape);
- Eigen::DSizes bcast_dims;
+ Eigen::DSizes bcast_dims;
for (int64_t j = 0; j < size; j++) {
bcast_dims[j] = shape[j];
}
bcast_dims[i] = 1;
outs[i]->Resize(out_dims);
- auto x = framework::EigenTensor::From(reshape_ins_tensor);
+ auto x = framework::EigenTensor::From(
+ static_cast(reshape_ins_tensor));
outs[i]->mutable_data(context.GetPlace());
auto y = framework::EigenTensor::From(*outs[i]);
auto& place =
*context.template device_context().eigen_device();
- y.device(place) = x.broadcast(bcast_dims);
+ EigenBroadcast, T, Rank>::Eval(place, y, x,
+ bcast_dims);
}
}
};
@@ -131,7 +137,8 @@ class MeshgridGradKernel : public framework::OpKernel {
REP_MESHGRID_GRAD_TEMPLATE(MAX_RANK_SUPPORTED)
default:
PADDLE_THROW(platform::errors::InvalidArgument(
- "only support tensor nums being between 1 and 6."));
+ "Expected Tensor numbers between 1 and 6, but received %d.",
+ n));
}
}
@@ -165,21 +172,20 @@ class MeshgridGradKernel : public framework::OpKernel {
}
}
- Eigen::DSizes reduce_dims;
+ Eigen::DSizes reduce_dims;
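// The reduce_dims/reshape_dims pair filled in below encodes the usual
// broadcast-gradient identity: a forward broadcast is undone by reshaping the
// output gradient and summing over the broadcast axes. EigenBroadcastGrad::Eval
// performs the same computation that the removed lines spelled out inline,
// roughly:
//   in_grad = out_grad_tmp.reshape(reshape_dims)
//                 .sum(reduce_dims)
//                 .reshape(in_grad.dimensions());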
for (int k = 0; k < n; k++) {
reduce_dims[k] = reduce_dims_vec[k];
}
- Eigen::DSizes reshape_dims;
+ Eigen::DSizes reshape_dims;
for (int k = 0; k < n * 2; k++) {
reshape_dims[k] = reshape_dims_vec[k];
}
- auto tensor_reduce_tmp =
- out_grad_tmp.reshape(reshape_dims).sum(reduce_dims);
auto& place =
*context.template device_context().eigen_device();
- in_grad.device(place) = tensor_reduce_tmp.reshape(in_grad.dimensions());
+ EigenBroadcastGrad, T, Rank>::Eval(
+ place, in_grad, out_grad_tmp, reduce_dims, reshape_dims);
}
}
};
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 8ceb22d8cc4c33..1bdb3728f538e2 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -20,6 +20,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/miopen_helper.h"
#endif
@@ -264,6 +266,34 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel {
std::string padding_algorithm = ctx.Attr("padding_algorithm");
const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC");
+#ifdef PADDLE_WITH_HIP
+ if (pooling_type == "max") {
+ using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap;
+ using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc;
+ auto &all_op_kernels =
+ paddle::framework::OperatorWithKernel::AllOpKernels();
+ std::string op_type = "pool2d_grad";
+ auto kernels_iter = all_op_kernels.find(op_type);
+ PADDLE_ENFORCE_NE(
+ kernels_iter, all_op_kernels.end(),
+ platform::errors::Unavailable(
+ "There are no kernels which are registered in the %s operator.",
+ op_type));
+ OpKernelMap &kernels = kernels_iter->second;
+ paddle::framework::OpKernelType expected_kernel_key(
+ paddle::framework::ToDataType(typeid(T)), ctx.GetPlace());
+ auto kernel_iter = kernels.find(expected_kernel_key);
+ PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
+ platform::errors::NotFound(
+ "Operator (%s) does not have kernel for %s.",
+ op_type, KernelTypeToString(expected_kernel_key)));
+ std::unique_ptr kernel_func_(
+ new OpKernelFunc(kernel_iter->second));
+ (*kernel_func_)(ctx);
+ return;
+ }
+#endif
+
// update paddings
auto in_x_dims = input->dims();
framework::DDim data_dims;
diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
index 159bdcabd657b0..277c93fad6aa83 100644
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc
@@ -119,6 +119,11 @@ class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Output data type")
.SetDefault(framework::proto::VarType::FP32);
+ AddAttr("is_test",
+ "(bool, default false) Set to true for inference only, false "
+ "for training.")
+ .SetDefault(false);
+
AddComment(R"DOC(
Lookup Table Prefetch Operator.
This operator is used to perform lookup on parameter W,
diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
index 0f1a096e207692..413b4ab358536c 100644
--- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
+++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h
@@ -30,6 +30,7 @@ class DistributedLookupTableKernel : public framework::OpKernel {
auto padding_idx = context.Attr("padding_idx");
auto table_id = context.Attr("table_id");
+ bool is_test = context.Attr("is_test");
auto embedding_name = context.InputNames("W").front();
int64_t emb_dim = 0;
@@ -55,7 +56,8 @@ class DistributedLookupTableKernel : public framework::OpKernel {
if (platform::is_cpu_place(context.GetPlace())) {
fleet->PullSparseToTensorSync(static_cast(table_id), emb_dim,
static_cast(padding_idx),
- context.GetPlace(), &inputs, &outputs);
+ context.GetPlace(), !is_test, &inputs,
+ &outputs);
} else {
auto inputs_variable = context.MultiInputVar("Ids");
auto outputs_variable = context.MultiOutputVar("Outputs");
@@ -93,7 +95,8 @@ class DistributedLookupTableKernel : public framework::OpKernel {
// use fleet->PullSparse
fleet->PullSparseToTensorSync(static_cast(table_id), emb_dim,
static_cast(padding_idx),
- cpu_place, &tmp_input_vec, &tmp_output_vec);
+ cpu_place, !is_test, &tmp_input_vec,
+ &tmp_output_vec);
// cp temp to origin
for (size_t idx = 0; idx < output_var_size; ++idx) {
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
index a793d12f522da5..5344147a9069cc 100644
--- a/paddle/fluid/operators/range_op.h
+++ b/paddle/fluid/operators/range_op.h
@@ -34,7 +34,7 @@ void GetSize(T start, T end, T step, int64_t* size) {
if (start > end) {
PADDLE_ENFORCE_LT(step, 0,
platform::errors::InvalidArgument(
- "step should be less than 0 while start > end."));
+ "The step should be less than 0 while start > end."));
}
*size = std::is_integral::value
diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h
index cfa88b9808d646..864a94a4235e65 100644
--- a/paddle/fluid/operators/scatter.h
+++ b/paddle/fluid/operators/scatter.h
@@ -102,9 +102,13 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
// check src shape and dst shape should match
for (int i = 1; i < src_dims.size(); i++)
- PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i],
- platform::errors::InvalidArgument(
- "src shape and dst shape should match"));
+ PADDLE_ENFORCE_EQ(
+ src_dims[i], dst_dims[i],
+ platform::errors::InvalidArgument(
+ "The dimensions of the source tensor and target tensor should"
+ " match, but received source tensor's %d-th dimension is %d, "
+ "target tensor's %d-th dimension is %d.",
+ i, src_dims[i], i, dst_dims[i]));
// slice size
size_t slice_size = 1;
@@ -146,9 +150,13 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src,
// check src shape and dst shape should match
for (int i = 1; i < src_dims.size(); i++)
- PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i],
- platform::errors::InvalidArgument(
- "src shape and dst shape should match"));
+ PADDLE_ENFORCE_EQ(
+ src_dims[i], dst_dims[i],
+ platform::errors::InvalidArgument(
+ "The dimensions of the source tensor and target tensor should"
+ " match, but received source tensor's %d-th dimension is %d, "
+ "target tensor's %d-th dimension is %d.",
+ i, src_dims[i], i, dst_dims[i]));
// slice size
size_t slice_size = 1;
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 2257d816d89218..140059256c3cc9 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -672,7 +672,11 @@ template static void SoftmaxWithCrossEntropyFusedKernel( const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, int64_t n, int64_t d, int axis_dim, gpuStream_t stream) { +#ifdef __HIPCC__ + constexpr int kMaxBlockDim = 256; +#else constexpr int kMaxBlockDim = 512; +#endif int64_t block_dim = axis_dim >= kMaxBlockDim ? kMaxBlockDim : (1 << static_cast(std::log2(axis_dim))); diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index d08a34ade77f28..69617b7e208a88 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -187,12 +187,6 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, x_d, N, H * W * D, C, stats); } - Tensor c_g_st; - auto *c_g_st_d = c_g_st.mutable_data>( - {2 * C + 1}, platform::CPUPlace()); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0); - #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *comm = dev_ctx.nccl_comm(); if (comm) { diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h index dffd3e58641770..4bbde8d08e06d8 100644 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -155,7 +156,7 @@ class TileKernel : public framework::OpKernel { "'repeat_times' for tile op must match after promotion.", vec_in_dims.size(), repeat_times.size())); auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < repeat_times.size(); ++i) { bcast_dims[i] = repeat_times[i]; } @@ -175,9 +176,11 @@ class TileKernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -255,21 +258,20 @@ class TileGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git 
a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc
index 2f71f10a1c4177..71cc586cb598fd 100644
--- a/paddle/fluid/operators/unstack_op.cc
+++ b/paddle/fluid/operators/unstack_op.cc
@@ -101,14 +101,18 @@ class UnStackGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0,
platform::errors::InvalidArgument(
- "Number of Inputs(Y@Grad) must be larger than 0"));
+ "The Inputs(Y@Grad) of unstack operator are empty."));
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", "X",
"UnStackGrad");
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y"));
for (size_t i = 1; i < input_dims.size(); ++i) {
- PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0],
- platform::errors::InvalidArgument(
- "Dims of all Inputs(Y@Grad) must be the same"));
+ PADDLE_ENFORCE_EQ(
+ input_dims[i], input_dims[0],
+ platform::errors::InvalidArgument(
+ "The dimensions of all Inputs(Y@Grad) must be the same, "
+ "but received Inputs(Y@Grad)'s %d-th dimension is %d, "
+ "while Inputs(Y@Grad)'s 0-th to %d-th dimensions are %d.",
+ i, input_dims[i], i - 1, input_dims[0]));
}
int axis = ctx->Attrs().Get("axis");
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 47344f0e3733d6..1e16008f36bb77 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -10,6 +10,12 @@ ELSE()
set(XPU_CTX_DEPS)
endif(WITH_XPU)
+if(WITH_ASCEND)
+ set(ASCEND_DEPS xpulib)
+ELSE()
+ set(ASCEND_DEPS)
+endif(WITH_ASCEND)
+
if (WITH_PYTHON)
py_proto_compile(profiler_py_proto SRCS profiler.proto)
add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
@@ -66,6 +72,10 @@ if(WITH_XPU)
cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib)
endif()
+if(WITH_ASCEND)
+ cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl)
+endif()
+
add_subdirectory(dynload)
add_subdirectory(stream)
diff --git a/paddle/fluid/platform/ascend_npu_info.cc b/paddle/fluid/platform/ascend_npu_info.cc
new file mode 100644
index 00000000000000..db8dafeae1e893
--- /dev/null
+++ b/paddle/fluid/platform/ascend_npu_info.cc
@@ -0,0 +1,36 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+#include "paddle/fluid/platform/ascend_npu_info.h"
+#include
+#include "acl/acl_rt.h"
+
+namespace paddle {
+namespace platform {
+namespace ascend {
+
+int NPUDevice::GetDeviceCount() {
+ uint32_t count = 0;
+ aclError status = aclrtGetDeviceCount(&count);
+ if (status != 0) {
+ PADDLE_THROW(platform::errors::InvalidArgument(
+ "aclrtGetDeviceCount error code: %d", status));
+ return -1;
+ }
+
+ return count;
+}
+
+} // namespace ascend
+} // namespace platform
+} // namespace paddle
diff --git a/paddle/fluid/platform/ascend_npu_info.h b/paddle/fluid/platform/ascend_npu_info.h
new file mode 100644
index 00000000000000..7afed121a5acb6
--- /dev/null
+++ b/paddle/fluid/platform/ascend_npu_info.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#ifdef PADDLE_WITH_ASCEND
+
+namespace paddle {
+namespace platform {
+namespace ascend {
+
+class NPUDevice {
+ public:
+ //! Get the total number of NPU devices in system.
+ static int GetDeviceCount();
+};
+
+} // namespace ascend
+} // namespace platform
+} // namespace paddle
+#endif
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index 4f504b414de4a7..dde9531e591442 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -26,14 +26,10 @@ namespace platform {
#ifdef PADDLE_WITH_HIP
#define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate))
#else
-#if CUDA_VERSION < 9000
-#define CREATE_SHFL_MASK(mask, predicate) mask = 0u;
-#else
#define FULL_WARP_MASK 0xFFFFFFFF
#define CREATE_SHFL_MASK(mask, predicate) \
mask = __ballot_sync(FULL_WARP_MASK, (predicate))
#endif
-#endif
inline static int RoundToPowerOfTwo(int dim) {
if (dim > 512) {
@@ -69,7 +65,7 @@ template
__forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
int delta, int width = warpSize) {
-#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP)
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, static_cast(delta), width);
@@ -79,7 +75,7 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
int width = warpSize) {
-#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP)
return __shfl_xor(val, width);
#else
return __shfl_xor_sync(mask, val, width);
@@ -87,7 +83,7 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val,
}
// CUDA 9.0 has native compatible float16 shfl_down
-#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000
+#if defined(PADDLE_WITH_HIP)
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
@@ -170,7 +166,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync(
template
__forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
int width = 32) {
-#if
defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) return __shfl(val, src_line, width); #else return __shfl_sync(mask, val, src_line, width); diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index fa4ef3f8c124e4..202be920c55953 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -25,10 +25,6 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000 -enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; -#endif - namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 340372007a77b0..94f64d158afbcb 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -200,6 +200,8 @@ CUDA_ATOMIC_WRAPPER(Max, float) { old = atomicCAS(address_as_i, assumed, __float_as_int(val)); } while (assumed != old); + + return __int_as_float(old); } CUDA_ATOMIC_WRAPPER(Max, double) { @@ -219,6 +221,8 @@ CUDA_ATOMIC_WRAPPER(Max, double) { old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); } while (assumed != old); + + return __longlong_as_double(old); } // For atomicMin @@ -272,6 +276,8 @@ CUDA_ATOMIC_WRAPPER(Min, float) { old = atomicCAS(address_as_i, assumed, __float_as_int(val)); } while (assumed != old); + + return __int_as_float(old); } CUDA_ATOMIC_WRAPPER(Min, double) { @@ -291,6 +297,8 @@ CUDA_ATOMIC_WRAPPER(Min, double) { old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); } while (assumed != old); + + return __longlong_as_double(old); } } // namespace platform diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 4c59fe5e9bae4b..366762401c741e 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -24,26 +24,9 @@ void* cudnn_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); -CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8 -CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 -CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R5 -CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R6 -CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8 +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP); #endif #ifdef CUDNN_DNN_ROUTINE_EACH_R7 diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index db84b8731f9ca4..f5045ff004ee9b 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -48,121 +48,93 @@ extern void EnforceCUDNNLoaded(const char* fn_name); * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - 
__macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreateSpatialTransformerDescriptor); \ - __macro(cudnnSetSpatialTransformerNdDescriptor); \ - __macro(cudnnDestroySpatialTransformerDescriptor); \ - __macro(cudnnSpatialTfGridGeneratorForward); \ - __macro(cudnnSpatialTfGridGeneratorBackward); \ - __macro(cudnnSpatialTfSamplerForward); \ - __macro(cudnnSpatialTfSamplerBackward); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnActivationBackward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ - __macro(cudnnFindConvolutionForwardAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); \ - __macro(cudnnCreateDropoutDescriptor); \ - __macro(cudnnDropoutGetStatesSize); \ - __macro(cudnnSetDropoutDescriptor); \ - __macro(cudnnRestoreDropoutDescriptor); \ - __macro(cudnnCreateRNNDescriptor); \ - __macro(cudnnGetRNNParamsSize); \ - __macro(cudnnGetRNNWorkspaceSize); \ - __macro(cudnnGetRNNTrainingReserveSize); \ - __macro(cudnnRNNForwardTraining); \ - __macro(cudnnRNNBackwardData); \ - __macro(cudnnRNNBackwardWeights); \ - __macro(cudnnRNNForwardInference); \ - __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetTensorNdDescriptorEx); - -CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - -#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ - __macro(cudnnAddTensor); \ - __macro(cudnnConvolutionBackwardData); \ - __macro(cudnnConvolutionBackwardFilter); -CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - -// APIs available after R3: -#if CUDNN_VERSION >= 3000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + 
__macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + __macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnActivationBackward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnFindConvolutionForwardAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnRestoreDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetTensorNdDescriptorEx); \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); \ __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ - __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -// APIs available after R3: -#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \ +#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnSetRNNDescriptor); -CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -// APIs available after R4: -#if CUDNN_VERSION >= 4007 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ - __macro(cudnnBatchNormalizationForwardTraining); \ - __macro(cudnnBatchNormalizationForwardInference); \ - __macro(cudnnBatchNormalizationBackward); 
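// With cuDNN 7.0 as the new minimum, the per-release groups removed here (R2
// through R6) always load, so they are folded into the single
// CUDNN_DNN_ROUTINE_EACH list above; only the symbols that cuDNN 8 dropped
// stay gated, behind CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8.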
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -// APIs in R5 -#if CUDNN_VERSION >= 5000 -#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ - __macro(cudnnCreateActivationDescriptor); \ - __macro(cudnnSetActivationDescriptor); \ - __macro(cudnnGetActivationDescriptor); \ - __macro(cudnnDestroyActivationDescriptor); -CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -// APIs in R6 -#if CUDNN_VERSION >= 6000 -#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6); -CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7001 diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 15de4c64e3e645..05b1fc891a0bf9 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -116,7 +116,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenPoolingForward); \ __macro(miopenPoolingBackward); \ __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxBackward_V2); \ __macro(miopenSoftmaxForward); \ + __macro(miopenSoftmaxForward_V2); \ __macro(miopenCreateDropoutDescriptor); \ __macro(miopenDestroyDropoutDescriptor); \ __macro(miopenRestoreDropoutDescriptor); \ diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index d181660e311960..75e35d398c27e7 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -197,8 +197,7 @@ limitations under the License. */ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_HIP) || \ - (defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000) +#if defined(PADDLE_WITH_HIP) ARITHMETIC_KERNEL(Add, +) ARITHMETIC_KERNEL(Sub, -) ARITHMETIC_KERNEL(Mul, *) diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index 0be4233269e0f4..25ae0ab264f2d8 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -27,24 +27,38 @@ namespace paddle { namespace platform { void PrintVar(framework::Scope* scope, const std::string& var_name, - const std::string& print_info) { + const std::string& print_info, std::stringstream* sstream) { framework::Variable* var = scope->FindVar(var_name); if (var == nullptr) { - VLOG(1) << "Variable Name " << var_name << " does not exist in your scope"; + VLOG(0) << "Variable Name " << var_name << " does not exist in your scope"; return; } framework::LoDTensor* tensor = var->GetMutable(); if (tensor == nullptr) { - VLOG(1) << "tensor of variable " << var_name + VLOG(0) << "tensor of variable " << var_name << " does not exist in your scope"; return; } - std::ostringstream sstream; - sstream << print_info << "\t"; - sstream << var_name << "\t"; - sstream << *tensor << "\t"; - std::cout << sstream.str() << std::endl; + *sstream << print_info << ": "; + +#define PrintTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + *sstream << "["; \ + auto* data = tensor->data(); \ + auto element_num = tensor->numel(); \ + if (element_num > 0) { \ + *sstream << data[0]; \ + for (int j = 1; j < element_num; ++j) { \ + *sstream << " " << data[j]; \ + } \ + } \ + *sstream << "]"; \ + } \ + } while (0) + + _ForEachDataType_(PrintTensorCallback); } } // end namespace platform diff --git a/paddle/fluid/platform/lodtensor_printer.h b/paddle/fluid/platform/lodtensor_printer.h index 
e0bd1fff197f70..d30afb62b0b8c6 100644 --- a/paddle/fluid/platform/lodtensor_printer.h +++ b/paddle/fluid/platform/lodtensor_printer.h @@ -26,6 +26,6 @@ class Scope; namespace paddle { namespace platform { void PrintVar(framework::Scope* scope, const std::string& var_name, - const std::string& print_info); + const std::string& print_info, std::stringstream* out); } // end namespace platform } // end namespace paddle diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc index 5b2af270740766..51bd55ebb7f488 100644 --- a/paddle/fluid/platform/lodtensor_printer_test.cc +++ b/paddle/fluid/platform/lodtensor_printer_test.cc @@ -18,5 +18,6 @@ TEST(LodTensorPrinter, PrintVar) { paddle::framework::Scope scope; - paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var"); + std::stringstream ss; + paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var", &ss); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5c9655edfb71fa..10c79933546b3c 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -7,6 +7,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator) +if (WITH_PSCORE) + set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) + set(PYBIND_DEPS ${PYBIND_DEPS} graph_py_service) +endif() if (WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) @@ -105,7 +109,7 @@ if(WITH_PYTHON) set(tmp_impl_file ${impl_file}.tmp) if(WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}") else() set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 00eca380859527..303ab5c0fe8ca4 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -32,6 +32,8 @@ limitations under the License. 
*/
#include
#include
#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
+#include "paddle/fluid/platform/ascend_npu_info.h"
+#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/pybind/ascend_wrapper_py.h"
using namespace ge; // NOLINT
@@ -40,6 +42,12 @@ namespace py = pybind11;
namespace paddle {
namespace pybind {
+#ifdef PADDLE_WITH_ASCEND_STRING
+using AscendString = AscendString;
+#else
+using AscendString = std::string;
+#endif
+
void BindAscendWrapper(py::module *m) {
py::class_>(*m, "AscendInstance")
@@ -47,13 +55,31 @@ void BindAscendWrapper(py::module *m) {
.def("init_global_resources",
&framework::AscendInstance::InitGlobalResouces,
py::call_guard())
+ .def("destroy_global_resources",
+ &framework::AscendInstance::DestroyGlobalResouces,
+ py::call_guard())
.def("add_ascend_subgraph", &framework::AscendInstance::AddAscendSubgraph,
py::call_guard());
-} // end AscendWrapper
+}
-Status ge_initialize(std::map &options) { // NOLINT
+std::map convert_map(
+ const std::map &options) {
+ std::map rets;
+ for (auto &option : options) {
+ AscendString key = option.first.c_str();
+ AscendString val = option.second.c_str();
+ rets[key] = val;
+ }
+ return rets;
+}
+
+ge::Status ge_initialize(
+ std::map &options) { // NOLINT
py::gil_scoped_release release;
- Status res = GEInitialize(options);
+ auto init_options = convert_map(options);
+ ge::Status res = ge::GEInitialize(init_options);
+ PADDLE_ENFORCE_EQ(res, ge::SUCCESS, platform::errors::Fatal(
+ "GEInitialize failed with error code %d", res));
py::gil_scoped_acquire acquire;
return res;
}
@@ -82,11 +108,18 @@ enum AttrType { AT_NAMEATTR };
+void BindAscendDevice(py::module *m) {
+ py::class_(*m, "NPUDevice")
+ .def_static(
+ "get_device_count",
+ static_cast(&platform::ascend::NPUDevice::GetDeviceCount));
+}
+
void BindAscendGraph(py::module *m) {
m->def("ge_initialize", &ge_initialize, "GEInitialize");
m->def("ge_finalize", &GEFinalize, "GEFinalize");
- // enum wrappers
+ // enum
py::enum_(*m, "GEGraphRunMode")
.value("PREDICTION", GraphRunMode::PREDICTION)
.value("TRAIN", GraphRunMode::TRAIN)
@@ -214,24 +247,34 @@
// class wrappers
py::class_(*m, "GESession")
- .def(py::init &>())
+ .def(py::init([](const std::map &options) {
+ return std::unique_ptr(
+ new ge::Session(convert_map(options)));
+ }))
+ .def("add_graph", (ge::Status (Session::*)(uint32_t, const Graph &)) &
+ Session::AddGraph)
.def("add_graph",
- (Status (Session::*)(uint32_t, const Graph &)) & Session::AddGraph)
- .def("add_graph",
- (Status (Session::*)(uint32_t, const Graph &,
- const std::map &)) &
- Session::AddGraph)
+ [](Session &ss, uint32_t index, const Graph &graph,
+ const std::map &options) {
+ return ss.AddGraph(index, graph, convert_map(options));
+ })
.def("remove_graph", &Session::RemoveGraph)
.def("run_graph",
[](Session &ss, uint32_t graphId,
const std::vector &inputs) -> py::tuple {
std::vector outputs;
- Status res = ss.RunGraph(graphId, inputs, outputs);
+ ge::Status res = ss.RunGraph(graphId, inputs, outputs);
return py::make_tuple(outputs, res);
},
py::call_guard())
.def("build_graph", &Session::BuildGraph)
.def("run_graph_async", &Session::RunGraphAsync)
+#ifdef PADDLE_WITH_ASCEND_STRING
+ .def("register_call_back_func",
+ static_cast(
+ &ge::Session::RegisterCallBackFunc))
+#else
.def("register_call_back_func", (Status (Session::*)( // NOLINT
const std::string &,
+ uint32_t graph_id, const std::map &params_list)>)) &
Session::RegisterCallBackFunc)
+#endif
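// convert_map (defined earlier in this file) is the bridge used throughout
// these bindings when PADDLE_WITH_ASCEND_STRING is set: pybind11 hands over a
// Python dict as std::map<std::string, std::string>, which is copied key by
// key into the std::map<AscendString, AscendString> the graph-engine API
// expects. A minimal sketch of the intended call pattern (the option key is
// illustrative only):
//   std::map<std::string, std::string> opts{{"ge.exec.deviceId", "0"}};
//   ge::Status st = ge::GEInitialize(convert_map(opts));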
.def("is_graph_need_rebuild", &Session::IsGraphNeedRebuild); py::class_(*m, "GEGraph") .def(py::init<>()) - .def(py::init()) + .def(py::init()) .def("set_inputs", &Graph::SetInputs) .def("set_outputs", (Graph & (Graph::*)(const std::vector &)) & Graph::SetOutputs) @@ -253,40 +297,70 @@ void BindAscendGraph(py::module *m) { Graph::SetOutputs) .def("set_outputs", (Graph & - (Graph::*)(const std::vector> + (Graph::*)(const std::vector> &)) & Graph::SetOutputs) .def("set_targets", &Graph::SetTargets) .def("is_valid", &Graph::IsValid) .def("add_op", &Graph::AddOp) .def("find_op_by_name", - [](Graph &graph, const std::string &name) -> py::tuple { + [](Graph &graph, const char *name) -> py::tuple { ge::Operator op; graphStatus status = graph.FindOpByName(name, op); return py::make_tuple(op, status); }) .def("find_op_by_type", - [](Graph &graph, const std::string &type) -> py::tuple { + [](Graph &graph, const char *type) -> py::tuple { std::vector ops; graphStatus status = graph.FindOpByType(type, ops); return py::make_tuple(ops, status); }) .def("get_all_op_name", [](Graph &graph) -> py::tuple { - std::vector op_name; + std::vector op_name; graphStatus status = graph.GetAllOpName(op_name); return py::make_tuple(op_name, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("save_to_file", + static_cast( + &ge::Graph::SaveToFile)) + .def("load_from_file", + static_cast( + &Graph::LoadFromFile)) + .def("get_name", + static_cast( + &Graph::GetName)) +#else .def("save_to_file", &Graph::SaveToFile) .def("load_from_file", &Graph::LoadFromFile) .def("get_name", &Graph::GetName) +#endif .def("set_need_iteration", &Graph::SetNeedIteration); py::class_(*m, "GEOperator") .def(py::init<>()) - .def(py::init()) - .def(py::init()) + .def(py::init()) + .def(py::init()) .def("is_empty", &Operator::IsEmpty) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_name", + static_cast( + &Operator::GetName)) + .def("get_op_type", + static_cast( + &Operator::GetOpType)) + .def("set_input", + (Operator & (Operator::*)(const char *, const Operator &)) & + Operator::SetInput) + .def("set_input", + (Operator & + (Operator::*)(const char *, const Operator &, const char *)) & + Operator::SetInput) + .def("set_input", (Operator & (Operator::*)(const char *, + const Operator &, uint32_t)) & + Operator::SetInput) +#else .def("get_name", &Operator::GetName) .def("get_op_type", &Operator::GetOpType) .def("set_input", @@ -299,13 +373,28 @@ void BindAscendGraph(py::module *m) { .def("set_input", (Operator & (Operator::*)(const std::string &, const Operator &, uint32_t)) & Operator::SetInput) +#endif .def("add_control_input", &Operator::AddControlInput) .def("get_input_const_data", - [](Operator &op, const std::string &dst_name) -> py::tuple { + [](Operator &op, const char *dst_name) -> py::tuple { Tensor data; graphStatus res = op.GetInputConstData(dst_name, data); return py::make_tuple(data, res); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_input_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) + .def("get_input_desc", + [](Operator &op, const std::string &name) { + return op.GetInputDescByName(name.c_str()); + }) + .def("get_dynamic_output_num", + static_cast( + &Operator::GetDynamicOutputNum)) + .def("get_dynamic_input_num", + static_cast( + &Operator::GetDynamicInputNum)) +#else .def("get_input_desc", (TensorDesc (Operator::*)(const std::string &) const) & Operator::GetInputDesc) @@ -313,12 +402,41 @@ void BindAscendGraph(py::module *m) { (TensorDesc (Operator::*)(uint32_t) const) & 
Operator::GetInputDesc) .def("get_dynamic_output_num", &Operator::GetDynamicOutputNum) .def("get_dynamic_input_num", &Operator::GetDynamicInputNum) +#endif .def("try_get_input_desc", - [](Operator &op, const std::string &name) -> py::tuple { + [](Operator &op, const char *name) -> py::tuple { TensorDesc tensor_desc; graphStatus status = op.TryGetInputDesc(name, tensor_desc); return py::make_tuple(tensor_desc, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("update_input_desc", + static_cast(&Operator::UpdateInputDesc)) + .def("get_output_desc", + [](Operator &op, const std::string &name) { + return op.GetOutputDescByName(name.c_str()); + }) + .def("get_output_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) + .def("update_output_desc", + static_cast(&Operator::UpdateOutputDesc)) + .def("get_dynamic_input_desc", + static_cast(&Operator::GetDynamicInputDesc)) + .def("update_dynamic_input_desc", + static_cast( + &Operator::UpdateDynamicInputDesc)) + .def("get_dynamic_output_desc", + static_cast(&Operator::GetDynamicOutputDesc)) + .def("update_dynamic_output_desc", + static_cast( + &Operator::UpdateDynamicOutputDesc)) +#else .def("update_input_desc", &Operator::UpdateInputDesc) .def("get_output_desc", (TensorDesc (Operator::*)(const std::string &) const) & @@ -330,33 +448,38 @@ void BindAscendGraph(py::module *m) { .def("update_dynamic_input_desc", &Operator::UpdateDynamicInputDesc) .def("get_dynamic_output_desc", &Operator::GetDynamicOutputDesc) .def("update_dynamic_output_desc", &Operator::UpdateDynamicOutputDesc) +#endif .def("infer_shape_and_type", &Operator::InferShapeAndType) .def("set_inference_context", &Operator::SetInferenceContext) .def("get_inference_context", &Operator::GetInferenceContext) .def("verify_all_attr", &Operator::VerifyAllAttr) .def("get_inputs_size", &Operator::GetInputsSize) .def("get_outputs_size", &Operator::GetOutputsSize) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_all_attr_names_and_types", + static_cast &) const>( + &Operator::GetAllAttrNamesAndTypes)) +#else .def("get_all_attr_names_and_types", &Operator::GetAllAttrNamesAndTypes) +#endif .def("set_attr_int64", - [](Operator &op, const std::string &name, - int64_t value) -> Operator & { + [](Operator &op, const char *name, int64_t value) -> Operator & { int64_t tar = (int64_t)value; return op.SetAttr(name, tar); }) .def("set_attr_int32", - [](Operator &op, const std::string &name, - int32_t value) -> Operator & { + [](Operator &op, const char *name, int32_t value) -> Operator & { int32_t tar = (int32_t)value; return op.SetAttr(name, tar); }) .def("set_attr_uint32", - [](Operator &op, const std::string &name, - uint32_t value) -> Operator & { + [](Operator &op, const char *name, uint32_t value) -> Operator & { uint32_t tar = (uint32_t)value; return op.SetAttr(name, tar); }) .def("set_attr_vec_int64", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -368,7 +491,7 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_vec_int32", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -380,7 +503,7 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_vec_uint32", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { 
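// Same conversion pattern as the other set_attr_vec_* bindings: copy the
// incoming elements into a std::vector of the exact element type GE expects,
// then forward that vector to Operator::SetAttr.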
int len = value.size(); std::vector tar; @@ -392,21 +515,20 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_list_int64", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, std::initializer_list &attrValue) -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) .def("set_attr_attrvalue", - [](Operator &op, const std::string &name, AttrValue &attrValue) + [](Operator &op, const char *name, AttrValue &attrValue) -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) - .def( - "set_attr_float", - [](Operator &op, const std::string &name, float value) -> Operator & { - float tar = static_cast(value); - return op.SetAttr(name, tar); - }) + .def("set_attr_float", + [](Operator &op, const char *name, float value) -> Operator & { + float tar = static_cast(value); + return op.SetAttr(name, tar); + }) .def("set_attr_vec_float", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -417,6 +539,15 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_string", + (Operator & (Operator::*)(const char *, const char *)) & + Operator::SetAttr) + .def("set_attr_vec_string", + (Operator & + (Operator::*)(const char *, const std::vector &)) & + Operator::SetAttr) +#else .def("set_attr_string", (Operator & (Operator::*)(const std::string &, const std::string &)) & Operator::SetAttr) @@ -424,15 +555,16 @@ void BindAscendGraph(py::module *m) { (Operator & (Operator::*)(const std::string &, const std::vector &)) & Operator::SetAttr) +#endif .def("set_attr_bool", - [](Operator &op, const std::string &name, bool value) -> Operator & { + [](Operator &op, const char *name, bool value) -> Operator & { if (value) return op.SetAttr(name, true); else return op.SetAttr(name, false); }) .def("set_attr_vec_bool", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -444,6 +576,15 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_tensor", + (Operator & (Operator::*)(const char *, const Tensor &)) & + Operator::SetAttr) + .def("set_attr_vec_tensor", + (Operator & + (Operator::*)(const char *, const std::vector &)) & + Operator::SetAttr) +#else .def("set_attr_tensor", (Operator & (Operator::*)(const std::string &, const Tensor &)) & Operator::SetAttr) @@ -451,8 +592,9 @@ void BindAscendGraph(py::module *m) { (Operator & (Operator::*)(const std::string &, const std::vector &)) & Operator::SetAttr) +#endif .def("set_attr_vec_uint8", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -463,13 +605,21 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_vec_vec_int64", + (Operator & + (Operator::*)(const char *, + const std::vector> &)) & + Operator::SetAttr) +#else .def("set_attr_vec_vec_int64", (Operator & (Operator::*)(const std::string &, const std::vector> &)) & Operator::SetAttr) +#endif .def("set_attr_vec_dtype", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ 
-481,15 +631,13 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_dtype", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const DataType &value) -> Operator & { ge::DataType tar = (ge::DataType)value; return op.SetAttr(name, tar); }) - .def("get_attr", - [](Operator &op, const std::string &name, - AttrType type) -> py::tuple { + [](Operator &op, const char *name, AttrType type) -> py::tuple { graphStatus res = -1; switch (type) { case AT_INT64: { @@ -538,12 +686,12 @@ void BindAscendGraph(py::module *m) { return py::make_tuple(o_av, res); } break; case AT_STRING: { - std::string s_av; + AscendString s_av; res = op.GetAttr(name, s_av); return py::make_tuple(s_av, res); } break; case AT_LIST_STRING: { - std::vector v_s_av; + std::vector v_s_av; res = op.GetAttr(name, v_s_av); return py::make_tuple(v_s_av, res); } break; @@ -594,11 +742,31 @@ void BindAscendGraph(py::module *m) { }) .def("break_connect", &Operator::BreakConnect) .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_subgraph_names", + static_cast &) const>(&Operator::GetSubgraphNames)) + .def("get_subgraph_builder", + static_cast(&Operator::GetSubgraphBuilder)) + .def("get_subgraph", + static_cast( + &Operator::GetSubgraph)) + .def("get_dynamic_subgraph_builder", + static_cast( + &Operator::GetDynamicSubgraphBuilder)) + .def("get_dynamic_subgraph", + static_cast(&Operator::GetDynamicSubgraph)); +#else + .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) .def("get_subgraph_names", &Operator::GetSubgraphNames) .def("get_subgraph_builder", &Operator::GetSubgraphBuilder) .def("get_subgraph", &Operator::GetSubgraph) .def("get_dynamic_subgraph_builder", &Operator::GetDynamicSubgraphBuilder) .def("get_dynamic_subgraph", &Operator::GetDynamicSubgraph); +#endif py::class_(*m, "GETensor") .def(py::init<>()) @@ -613,10 +781,15 @@ void BindAscendGraph(py::module *m) { Tensor::SetData) .def("set_data", (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_data", + (graphStatus (Tensor::*)(const char *)) & Tensor::SetData) +#else .def("set_data", (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData) +#endif .def("set_data", - (graphStatus (Tensor::*)(const std::vector &)) & + (graphStatus (Tensor::*)(const std::vector &)) & Tensor::SetData) .def("get_data", @@ -638,8 +811,8 @@ void BindAscendGraph(py::module *m) { .def(py::init(), py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def(py::init()) - .def("update", - (void (TensorDesc::*)(Shape, Format, DataType)) & TensorDesc::Update, + .def("update", (void (TensorDesc::*)(const Shape &, Format, DataType)) & + TensorDesc::Update, py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def("set_shape", &TensorDesc::SetShape) @@ -660,8 +833,16 @@ void BindAscendGraph(py::module *m) { .def("get_origin_format", &TensorDesc::GetOriginFormat) .def("set_data_type", &TensorDesc::SetDataType) .def("get_data_type", &TensorDesc::GetDataType) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_name", static_cast( + &TensorDesc::SetName)) + .def("get_name", + static_cast( + &TensorDesc::GetName)) +#else .def("set_name", &TensorDesc::SetName) .def("get_name", &TensorDesc::GetName) +#endif .def("set_size", &TensorDesc::SetSize) .def("get_size", &TensorDesc::GetSize) .def("set_real_dim_cnt", &TensorDesc::SetRealDimCnt) @@ -679,16 +860,27 @@ void 
@@ -679,16 +860,27 @@ void BindAscendGraph(py::module *m) { py::class_<AttrValue>(*m, "GEAttrValue").def(py::init<>()); py::class_<OperatorFactory>(*m, "GEOperatorFactory") +#ifdef PADDLE_WITH_ASCEND_STRING + .def_static("create_operator", + static_cast<Operator (*)(const char *, const char *)>( + &ge::OperatorFactory::CreateOperator)) +#else .def("create_operator", &OperatorFactory::CreateOperator) +#endif .def("get_ops_type_list", []() -> py::tuple { - std::vector<std::string> all_ops; + std::vector<AscendString> all_ops; graphStatus status = OperatorFactory::GetOpsTypeList(all_ops); return py::make_tuple(all_ops, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def_static("is_exist_op", static_cast<bool (*)(const char *)>( + &OperatorFactory::IsExistOp)); +#else .def("is_exist_op", &OperatorFactory::IsExistOp); +#endif } -} // end namespace pybind -} // end namespace paddle +} // namespace pybind +} // namespace paddle #endif diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h index 4af96d6ef4b92a..e999080544c31b 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.h +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -25,6 +25,7 @@ namespace pybind { void BindAscendGraph(py::module* m); void BindAscendWrapper(py::module* m); +void BindAscendDevice(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index ba716fb3b550ac..0a2159667f3525 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -32,6 +32,8 @@ limitations under the License. */ #include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/service/communicator.h" #include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" #include "paddle/fluid/distributed/service/heter_client.h" namespace py = pybind11; @@ -39,6 +41,11 @@ using paddle::distributed::CommContext; using paddle::distributed::Communicator; using paddle::distributed::FleetWrapper; using paddle::distributed::HeterClient; +using paddle::distributed::GraphPyService; +using paddle::distributed::GraphNode; +using paddle::distributed::GraphPyServer; +using paddle::distributed::GraphPyClient; +using paddle::distributed::FeatureNode; namespace paddle { namespace pybind { @@ -152,5 +159,58 @@ void BindHeterClient(py::module* m) { .def("stop", &HeterClient::Stop); } +void BindGraphNode(py::module* m) { + py::class_<GraphNode>(*m, "GraphNode") + .def(py::init<>()) + .def("get_id", &GraphNode::get_id) + .def("get_feature", &GraphNode::get_feature); +} +void BindGraphPyFeatureNode(py::module* m) { + py::class_<FeatureNode>(*m, "FeatureNode") + .def(py::init<>()) + .def("get_id", &GraphNode::get_id) + .def("get_feature", &GraphNode::get_feature); +} + +void BindGraphPyService(py::module* m) { + py::class_<GraphPyService>(*m, "GraphPyService").def(py::init<>()); +} + +void BindGraphPyServer(py::module* m) { + py::class_<GraphPyServer>(*m, "GraphPyServer") + .def(py::init<>()) + .def("start_server", &GraphPyServer::start_server) + .def("set_up", &GraphPyServer::set_up) + .def("add_table_feat_conf", &GraphPyServer::add_table_feat_conf); +}
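The `GraphPyClient` binding that follows returns raw feature buffers to Python. Wrapping each buffer in `py::bytes` matters because a returned `std::string` would be decoded as UTF-8 on the Python side and can fail on arbitrary binary data. A small stand-alone sketch of that conversion (the `FetchFeatures()` backend here is hypothetical, not a Paddle API):

```cpp
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>

#include <string>
#include <vector>

namespace py = pybind11;

// Hypothetical backend call returning one binary buffer per feature.
static std::vector<std::string> FetchFeatures() {
  return {std::string("\x01\x02\x00\x03", 4), std::string("plain")};
}

PYBIND11_MODULE(bytes_demo, m) {
  m.def("get_feats", []() {
    std::vector<py::bytes> out;
    for (const auto &buf : FetchFeatures()) {
      out.emplace_back(buf);  // py::bytes carries the raw buffer unmodified
    }
    return out;  // arrives in Python as a list of bytes objects
  });
}
```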
+void BindGraphPyClient(py::module* m) { + py::class_<GraphPyClient>(*m, "GraphPyClient") + .def(py::init<>()) + .def("load_edge_file", &GraphPyClient::load_edge_file) + .def("load_node_file", &GraphPyClient::load_node_file) + .def("set_up", &GraphPyClient::set_up) + .def("add_table_feat_conf", &GraphPyClient::add_table_feat_conf) + .def("pull_graph_list", &GraphPyClient::pull_graph_list) + .def("start_client", &GraphPyClient::start_client) + .def("batch_sample_neighboors", &GraphPyClient::batch_sample_neighboors) + .def("random_sample_nodes", &GraphPyClient::random_sample_nodes) + .def("stop_server", &GraphPyClient::stop_server) + .def("get_node_feat", + [](GraphPyClient& self, std::string node_type, + std::vector<uint64_t> node_ids, + std::vector<std::string> feature_names) { + auto feats = + self.get_node_feat(node_type, node_ids, feature_names); + std::vector<std::vector<py::bytes>> bytes_feats(feats.size()); + for (int i = 0; i < feats.size(); ++i) { + for (int j = 0; j < feats[i].size(); ++j) { + bytes_feats[i].push_back(py::bytes(feats[i][j])); + } + } + return bytes_feats; + }) + .def("bind_local_server", &GraphPyClient::bind_local_server); +} + } // end namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 7f471598ad2818..11b430cd208fd3 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -27,6 +27,10 @@ void BindPSHost(py::module* m); void BindCommunicatorContext(py::module* m); void BindDistCommunicator(py::module* m); void BindHeterClient(py::module* m); - +void BindGraphNode(py::module* m); +void BindGraphPyService(py::module* m); +void BindGraphPyFeatureNode(py::module* m); +void BindGraphPyServer(py::module* m); +void BindGraphPyClient(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index eed3b3b7691b1e..4ab507fe367254 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/bkcl_context.h" #include "paddle/fluid/imperative/data_loader.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/partial_grad_engine.h" @@ -63,6 +64,65 @@ class Layer : public imperative::Layer { } }; +template <typename T> +static T PyObjectCast(PyObject *obj) { + try { + return py::cast<T>(py::handle(obj)); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Python object is not type of %s", typeid(T).name())); + } +} + +class PyVariableWrapperHook : public imperative::VariableWrapperHook { + public: + explicit PyVariableWrapperHook(PyObject *func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyVariableWrapperHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + std::shared_ptr<imperative::VariableWrapper> operator()( + const std::shared_ptr<imperative::VariableWrapper> &var) override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyVariableWrapperHook for var " << var->Name(); + + // 1. unpack temp VarBase from VariableWrapper + std::shared_ptr<imperative::VarBase> tmp_varbase = + std::make_shared<imperative::VarBase>(var); + + // 2. call hook and return + PyObject *res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, py::cast(tmp_varbase).ptr(), + nullptr); + } catch (platform::EnforceNotMet &e) { + throw std::move(e); + } catch (std::exception &e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) { + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "Hook function of Tensor returns a nullptr.")); + if (res == Py_None) { + return var; + } + + return PyObjectCast<std::shared_ptr<imperative::VarBase>>(res)->SharedVar(); + } + + private: + PyObject *py_func_; +}; + static const platform::Place PyObjectToPlace(const py::object &place_obj) { if (py::isinstance<platform::CPUPlace>(place_obj)) { return place_obj.cast<platform::CPUPlace>(); @@ -213,16 +273,6 @@ static std::string GetTypeName(const imperative::VarBase &var) { using PyNameVarBaseMap = std::unordered_map<std::string, py::handle>; -template <typename T> -static T PyObjectCast(PyObject *obj) { - try { - return py::cast<T>(py::handle(obj)); - } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Python object is not type of %s", typeid(T).name())); - } -} - // NOTE(zjl): py::handle is a very light wrapper of PyObject *. // Unlike py::object, py::handle does not change reference count of PyObject *. static std::vector<std::shared_ptr<imperative::VarBase>> @@ -494,6 +544,39 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::take_ownership); + m.def("_array_to_share_memory_tensor", + [](py::object &obj) { + // 1. cast to python array + auto array = obj.cast<py::array>(); + PADDLE_ENFORCE_NE( + string::Sprintf("%s", array.dtype()).compare("object"), 0, + platform::errors::InvalidArgument( + "Failed to convert input data to a regular ndarray.\n * " + "Usually this means the input data contains nested " + "lists with different lengths.\n * Check the reader " + "function passed to 'set_(sample/sample_list/batch)" + "_generator' to locate the data causes this issue.")); + // 2. construct LoDTensor + framework::LoDTensor t; + SetTensorFromPyArray<platform::CPUPlace>(&t, array, + platform::CPUPlace(), true); + // 3. allocate shared memory + void *data_ptr = t.data(); + size_t data_size = t.numel() * framework::SizeOfType(t.type()); + auto shared_writer_holder = + memory::allocation::AllocateMemoryMapWriterAllocation(data_size); + // 4. maintain mmap fd set & backup ipc_name + const std::string &ipc_name = shared_writer_holder->ipc_name(); + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + // 5. copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t.ResetHolder(shared_writer_holder); + + return t; + }, + py::return_value_policy::take_ownership); + m.def("_remove_tensor_list_mmap_fds", [](py::list &tensor_list) { for (size_t i = 0; i < tensor_list.size(); ++i) { auto t = tensor_list[i].cast<framework::LoDTensor>();
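`PyVariableWrapperHook` above keeps a borrowed `PyObject *` alive across C++ ownership, so it must bump the reference count on construction and must hold the GIL for every interpreter interaction, including the `Py_DECREF` in the destructor, which may run on a non-Python thread. A stripped-down sketch of just that ownership pattern (`PyCallbackHolder` is illustrative, not Paddle code):

```cpp
#include <pybind11/pybind11.h>

namespace py = pybind11;

class PyCallbackHolder {
 public:
  explicit PyCallbackHolder(PyObject *func) : func_(func) {
    Py_INCREF(func_);  // keep the callable alive while we hold it
  }

  ~PyCallbackHolder() {
    py::gil_scoped_acquire gil;  // the destructor may run off-thread
    Py_DECREF(func_);
  }

  double Call(double x) {
    py::gil_scoped_acquire gil;
    PyObject *arg = PyFloat_FromDouble(x);
    PyObject *res = PyObject_CallFunctionObjArgs(func_, arg, nullptr);
    Py_DECREF(arg);
    if (res == nullptr) throw py::error_already_set();
    double out = PyFloat_AsDouble(res);
    Py_DECREF(res);
    return out;
  }

 private:
  PyObject *func_;
};

PYBIND11_MODULE(hook_demo, m) {
  m.def("apply", [](const py::handle &f, double x) {
    PyCallbackHolder holder(f.ptr());
    return holder.Call(x);  // e.g. apply(lambda v: v * 2, 3.0) -> 6.0
  });
}
```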
)DOC") .def("numpy", + [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); @@ -920,18 +1004,6 @@ void BindImperative(py::module *m_ptr) { print(x.stop_gradient) # True print(x.grad) # None )DOC") - .def("_run_backward", - [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph) { - // TODO(jiabin): when we impl more backward execution we can - // select them - auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph); - VLOG(3) << "Start backward"; - engine->Execute(); - VLOG(3) << "Finish backward"; - }, - py::call_guard()) .def("_grad_name", &imperative::VarBase::GradVarName) .def("_grad_value", [](imperative::VarBase &self) { @@ -990,6 +1062,23 @@ void BindImperative(py::module *m_ptr) { } }, py::call_guard()) + .def("_register_grad_hook", + [](imperative::VarBase &self, const py::handle &hook) { + PADDLE_ENFORCE_EQ( + self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot register hook on a tensor without gradient.")); + return self.GradVarBase()->AddHook( + std::make_shared(hook.ptr())); + }) + .def("_remove_grad_hook", + [](imperative::VarBase &self, int64_t hook_id) { + PADDLE_ENFORCE_EQ( + self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot remove hook on a tensor without gradient.")); + return self.GradVarBase()->RemoveHook(hook_id); + }) .def("cpu", [](const std::shared_ptr &self) { if (platform::is_cpu_place(self->Place())) { @@ -1111,6 +1200,35 @@ void BindImperative(py::module *m_ptr) { y = x.cuda(1) print(y.place) # CUDAPlace(1) )DOC") + .def("_share_memory", + [](const std::shared_ptr &self) { +#ifndef _WIN32 + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(self->Place()), true, + platform::errors::InvalidArgument( + "Sharing memory only support CPU Tensor currently")); + // 1. get LoDTensor + auto *t = self->MutableVar()->GetMutable(); + // 2. allocate shared memory + void *data_ptr = t->data(); + size_t data_size = t->numel() * framework::SizeOfType(t->type()); + auto shared_writer_holder = + memory::allocation::AllocateMemoryMapWriterAllocation( + data_size); + // 3. maintain mmap fd set & backup ipc_name + const std::string &ipc_name = shared_writer_holder->ipc_name(); + memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); + // 4. 
copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t->ResetHolder(shared_writer_holder); + return *t; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Sharing memory in Windows OS is not supported currently")); +#endif + }, + py::return_value_policy::reference) .def("copy_", &imperative::VarBase::CopyFrom) .def("_copy_to", [](const std::shared_ptr &self, @@ -1169,22 +1287,28 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly( - "shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().dims()); - } else if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().value().dims()); - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly("shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var() + .Get() + .dims()); + } else if (self.Var() + .IsType< + framework::SelectedRows>()) { + return framework::vectorize( + self.Var() + .Get() + .value() + .dims()); + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. @@ -1414,6 +1538,19 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); + m.def( + "dygraph_run_backward", + [](const std::vector> &tensors, + const std::vector> &grad_tensors, + bool retain_graph, const imperative::Tracer &tracer) { + auto *engine = tracer.GetEngine(); + engine->Init(tensors, grad_tensors, retain_graph); + VLOG(3) << "Start backward"; + engine->Execute(); + VLOG(3) << "Finish backward"; + }, + py::call_guard()); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) py::class_()) - .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); }); + .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::NCCLParallelContext::InitWithRingID, + py::arg("ring_id")); #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -1452,7 +1592,10 @@ void BindImperative(py::module *m_ptr) { m, "BKCLParallelContext") .def(py::init()) - .def("init", [](imperative::BKCLParallelContext &self) { self.Init(); }); + .def("init", [](imperative::BKCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::BKCLParallelContext::InitWithRingID, + py::arg("ring_id")); #endif } diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 69856fa4fa142e..2c1927f49f6b70 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -16,6 +16,9 @@ #include #include #include +#ifndef _WIN32 +#include +#endif #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" @@ -23,6 +26,9 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#endif // NOTE(zhiqiu): Commonly, the inputs in 
@@ -119,6 +125,8 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = { {"fill_constant", {"Out"}}, {"matmul", {"Out"}}, {"c_broadcast", {"Out"}}, + {"c_sync_calc_stream", {"Out"}}, + {"c_sync_comm_stream", {"Out"}}, {"c_allreduce_sum", {"Out"}}, {"c_allreduce_max", {"Out"}}, {"c_allreduce_min", {"Out"}}, @@ -559,6 +567,11 @@ int main(int argc, char* argv[]) { return -1; } +#ifdef PADDLE_WITH_ASCEND + auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); + ascend_ptr->InitGEForUT(); +#endif + std::vector<std::string> headers{"\"paddle/fluid/imperative/tracer.h\""}; std::ofstream out(argv[1], std::ios::out); @@ -588,5 +601,9 @@ int main(int argc, char* argv[]) { << "} // namespace paddle\n"; out.close(); + +#ifdef PADDLE_WITH_ASCEND + ge::GEFinalize(); +#endif return 0; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d8ee80c0070e7d..5bf70d1126b892 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -143,6 +143,14 @@ bool IsCompiledWithROCM() { #endif } +bool IsCompiledWithAscend() { +#ifndef PADDLE_WITH_ASCEND + return false; +#else + return true; +#endif +} + bool IsCompiledWithXPU() { #ifndef PADDLE_WITH_XPU return false; @@ -1756,6 +1764,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_devices", []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); @@ -2885,6 +2894,7 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_ASCEND BindAscendWrapper(&m); BindAscendGraph(&m); + BindAscendDevice(&m); #endif #ifdef PADDLE_WITH_CRYPTO BindCrypto(&m); @@ -2896,6 +2906,11 @@ All parameter, weight, gradient are variables in Paddle.
BindCommunicatorContext(&m); BindDistCommunicator(&m); BindHeterClient(&m); + BindGraphPyFeatureNode(&m); + BindGraphNode(&m); + BindGraphPyService(&m); + BindGraphPyServer(&m); + BindGraphPyClient(&m); #endif } } // namespace pybind diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt deleted file mode 100644 index 0688c63cac3f3f..00000000000000 --- a/paddle/fluid/train/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -function(train_test TARGET_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs ARGS) - cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if (NOT APPLE AND NOT WIN32) - cc_test(test_train_${TARGET_NAME} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_inference_shared - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - else() - cc_test(test_train_${TARGET_NAME} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_inference_io - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - endif() - if(TEST test_train_${TARGET_NAME}) - set_tests_properties(test_train_${TARGET_NAME} - PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) - if(NOT WIN32 AND NOT APPLE) - set_tests_properties(test_train_${TARGET_NAME} - PROPERTIES TIMEOUT 150) - endif() - endif() -endfunction(train_test) - - -if(WITH_TESTING) - train_test(recognize_digits) -endif() diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt deleted file mode 100644 index 95da77d68d482a..00000000000000 --- a/paddle/fluid/train/demo/CMakeLists.txt +++ /dev/null @@ -1,77 +0,0 @@ -cmake_minimum_required(VERSION 3.0) - -project(cpp_train_demo CXX C) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir") -endif() - -option(WITH_MKLDNN "Compile PaddlePaddle with MKLDNN" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support, default use openblas." 
OFF) - -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") -include_directories("${PADDLE_LIB}/third_party/threadpool") -include_directories("${PADDLE_LIB}/third_party/dlpack") - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") - -add_executable(demo_trainer demo_trainer.cc) - -if(WITH_MKLDNN) - add_definitions(-DPADDLE_WITH_MKLDNN) - include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - if(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) - endif(WIN32) -endif(WITH_MKLDNN) - -if(WITH_MKL) - include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - if(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) - else(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) - endif(WIN32) -else() - if(APPLE) - set(MATH_LIB cblas) - elseif(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) - else() - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) - endif(APPLE) -endif() - -if(APPLE) - set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security") -else(APPLE) - set(ARCHIVE_START "-Wl,--whole-archive") - set(ARCHIVE_END "-Wl,--no-whole-archive") - set(EXTERNAL_LIB "-lrt -ldl -lpthread") -endif(APPLE) - -target_link_libraries(demo_trainer - ${MACOS_LD_FLAGS} - ${ARCHIVE_START} - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference.so - ${ARCHIVE_END} - ${MATH_LIB} - ${MKLDNN_LIB} - glog gflags protobuf z xxhash - ${EXTERNAL_LIB}) diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md deleted file mode 100644 index 8a44c25aea9a0d..00000000000000 --- a/paddle/fluid/train/demo/README.md +++ /dev/null @@ -1,65 +0,0 @@ - -### step 1. build paddle lib - -``` - -# WITH_MKL=ON|OFF -# WITH_MKLDNN=ON|OFF - -PADDLE_LIB=/paddle/lib/dir -cmake .. -DPADDLE_INSTALL_DIR=$PADDLE_LIB \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_GPU=OFF \ - -DWITH_STYLE_CHECK=OFF \ - -DWITH_MKL=OFF \ - -DWITH_MKLDNN=OFF -make -j8 -make -j8 fluid_lib_dist -``` - -### step 2. generate program desc -``` -# please install paddle before run this scripe -pip install --upgrade paddlepaddle-*.whl -python demo_network.py -``` - -This will generate two program desc files: - - startup_program: used to init all parameters - - main_program: main logic of the network - -### step 3. build demo_trainer and run it. - - -``` -# Make a build dir at the same dir of this README.md document. -# The demo dir can be put anywhere. 
-mkdir build -cd build - -# WITH_MKL=ON|OFF -# WITH_MKLDNN=ON|OFF -PADDLE_LIB=/paddle/lib/dir - -# PADDLE_LIB is the same with PADDLE_INSTALL_DIR when building the lib -cmake .. -DPADDLE_LIB=$PADDLE_LIB \ - -DWITH_MKLDNN=OFF \ - -DWITH_MKL=OFF -make - -# copy startup_program and main_program to this dir -cp ../startup_program . -cp ../main_program . - -# run demo cpp trainer -./demo_trainer - -``` - -The output will be: -``` -step: 0 loss: 1069.02 -step: 1 loss: 1069.02 -step: 2 loss: 1069.02 -.... -``` diff --git a/paddle/fluid/train/demo/demo_network.py b/paddle/fluid/train/demo/demo_network.py deleted file mode 100644 index 41e98c6a24a750..00000000000000 --- a/paddle/fluid/train/demo/demo_network.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -import paddle.fluid.framework as framework - - -def train_network(with_optimize): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - if with_optimize: - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.00001) - sgd_optimizer.minimize(avg_cost) - else: - fluid.backward.append_backward(avg_cost) - - -def save_program_desc(network_func): - startup_program = framework.Program() - train_program = framework.Program() - - with framework.program_guard(train_program, startup_program): - network_func(with_optimize=False) - - with open("startup_program", "w") as f: - f.write(startup_program.desc.serialize_to_string()) - with open("main_program", "w") as f: - f.write(train_program.desc.serialize_to_string()) - - -save_program_desc(train_network) diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc deleted file mode 100644 index 830f00b8db1d5c..00000000000000 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace train { - -void ReadBinaryFile(const std::string& filename, std::string* contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE_EQ( - fin.is_open(), true, - platform::errors::Unavailable("Failed to open file %s.", filename)); - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); -} - -std::unique_ptr Load( - paddle::framework::Executor* executor, const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; - std::string program_desc_str; - ReadBinaryFile(model_filename, &program_desc_str); - - std::unique_ptr main_program( - new paddle::framework::ProgramDesc(program_desc_str)); - return main_program; -} - -} // namespace train -} // namespace paddle - -int main() { - paddle::framework::InitDevices(); - - const auto cpu_place = paddle::platform::CPUPlace(); - - paddle::framework::Executor executor(cpu_place); - paddle::framework::Scope scope; - auto startup_program = paddle::train::Load(&executor, "startup_program"); - auto train_program = paddle::train::Load(&executor, "main_program"); - - std::string loss_name = ""; - for (auto op_desc : train_program->Block(0).AllOps()) { - if (op_desc->Type() == "mean") { - loss_name = op_desc->Output("Out")[0]; - break; - } - } - - PADDLE_ENFORCE_NE(loss_name, "", - platform::errors::NotFound("Loss name is not found.")); - - // init all parameters - executor.Run(*startup_program, &scope, 0); - - // prepare data - auto x_var = scope.Var("x"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({2, 13}); - - auto x_data = x_tensor->mutable_data(cpu_place); - for (int i = 0; i < 2 * 13; ++i) { - x_data[i] = static_cast(i); - } - - auto y_var = scope.Var("y"); - auto y_tensor = y_var->GetMutable(); - y_tensor->Resize({2, 1}); - auto y_data = y_tensor->mutable_data(cpu_place); - for (int i = 0; i < 2 * 1; ++i) { - y_data[i] = static_cast(i); - } - - auto loss_var = scope.Var(loss_name); - - paddle::platform::ProfilerState pf_state; - pf_state = paddle::platform::ProfilerState::kCPU; - paddle::platform::EnableProfiler(pf_state); - clock_t t1 = clock(); - - for (int i = 0; i < 10; ++i) { - executor.Run(*train_program, &scope, 0, false, true); - std::cout << "step: " << i << " loss: " - << loss_var->Get().data()[0] - << std::endl; - } - - clock_t t2 = clock(); - paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal, - "run_paddle_op_profiler"); - std::cout << "run_time = " << t2 - t1 << std::endl; - return 0; -} diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh deleted file mode 100755 index 2955e7574daa2d..00000000000000 --- a/paddle/fluid/train/demo/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -x - -PADDLE_ROOT=$1 -TURN_ON_MKL=$2 # use MKL or Openblas - -# download models -function download() { - wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/main_program - wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/startup_program -} - -download - -# build demo 
trainer -paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir - -mkdir -p build -cd build -rm -rf * -cmake .. -DPADDLE_LIB=$paddle_install_dir \ - -DWITH_MKLDNN=$TURN_ON_MKL \ - -DWITH_MKL=$TURN_ON_MKL -make - -cd .. - -# run demo trainer -build/demo_trainer diff --git a/paddle/fluid/train/imdb_demo/CMakeLists.txt b/paddle/fluid/train/imdb_demo/CMakeLists.txt deleted file mode 100644 index e943d6bc78eab0..00000000000000 --- a/paddle/fluid/train/imdb_demo/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -cmake_minimum_required(VERSION 3.0) - -project(cpp_imdb_train_demo CXX C) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir") -endif() - -option(WITH_MKLDNN "Compile PaddlePaddle with MKLDNN" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support, default use openblas." OFF) - -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") -include_directories("${PADDLE_LIB}/third_party/threadpool") -include_directories("${PADDLE_LIB}/third_party/dlpack") - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") - -add_executable(demo_trainer save_model.cc demo_trainer.cc) - -if(WITH_MKLDNN) - include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - if(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) - endif(WIN32) -endif(WITH_MKLDNN) - -if(WITH_MKL) - include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - if(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) - else(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) - endif(WIN32) -else() - if(APPLE) - set(MATH_LIB cblas) - elseif(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) - else() - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) - endif(APPLE) -endif() - -if(APPLE) - set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security") -else(APPLE) - set(ARCHIVE_START "-Wl,--whole-archive") - set(ARCHIVE_END "-Wl,--no-whole-archive") - set(EXTERNAL_LIB "-lrt -ldl -lpthread") -endif(APPLE) - -target_link_libraries(demo_trainer - ${MACOS_LD_FLAGS} - ${ARCHIVE_START} - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference.so - ${ARCHIVE_END} - ${MATH_LIB} - ${MKLDNN_LIB} - glog gflags protobuf z xxhash - ${EXTERNAL_LIB}) diff --git a/paddle/fluid/train/imdb_demo/README.md b/paddle/fluid/train/imdb_demo/README.md deleted file mode 100644 index 28fd66710f80dd..00000000000000 --- a/paddle/fluid/train/imdb_demo/README.md +++ /dev/null @@ -1,115 +0,0 @@ -# Train with C++ inference API - 
-What is C++ inference API and how to install it: - -see: [PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线](https://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/deploy/inference/index_cn.html) - -After downloading the source code of Paddle, you can build your own inference lib: - -```shell -PADDLE_ROOT=./Paddle -cd Paddle -mkdir build -cd build -cmake -DPADDLE_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - -DON_INFER=ON \ - .. -make -make inference_lib_dist -``` - -## IMDB task - -see: [IMDB Dataset of 50K Movie Reviews | Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) - -## Quick Start - -### prepare data - -```shell - wget https://fleet.bj.bcebos.com/text_classification_data.tar.gz - tar -zxvf text_classification_data.tar.gz -``` -### build - -```shell - mkdir build - cd build - rm -rf * - PADDLE_LIB=path/to/Paddle/build/paddle_install_dir - cmake .. -DPADDLE_LIB=$PADDLE_LIB -DWITH_MKLDNN=OFF -DWITH_MKL=OFF - make -``` - -### generate program description - -``` - python generate_program.py bow -``` - -### run - -```shell - # After editing train.cfg - sh run.sh -``` - -## results - -Below are training logs on BOW model, the losses go down as expected. - -``` -WARNING: Logging before InitGoogleLogging() is written to STDERR -I0731 22:39:06.974232 10965 demo_trainer.cc:130] Start training... -I0731 22:39:57.395229 10965 demo_trainer.cc:164] epoch: 0; average loss: 0.405706 -I0731 22:40:50.262344 10965 demo_trainer.cc:164] epoch: 1; average loss: 0.110746 -I0731 22:41:49.731079 10965 demo_trainer.cc:164] epoch: 2; average loss: 0.0475805 -I0731 22:43:31.398355 10965 demo_trainer.cc:164] epoch: 3; average loss: 0.0233249 -I0731 22:44:58.744391 10965 demo_trainer.cc:164] epoch: 4; average loss: 0.00701507 -I0731 22:46:30.451735 10965 demo_trainer.cc:164] epoch: 5; average loss: 0.00258187 -I0731 22:48:14.396687 10965 demo_trainer.cc:164] epoch: 6; average loss: 0.00113157 -I0731 22:49:56.242744 10965 demo_trainer.cc:164] epoch: 7; average loss: 0.000698234 -I0731 22:51:11.585919 10965 demo_trainer.cc:164] epoch: 8; average loss: 0.000510136 -I0731 22:52:50.573947 10965 demo_trainer.cc:164] epoch: 9; average loss: 0.000400932 -I0731 22:54:02.686152 10965 demo_trainer.cc:164] epoch: 10; average loss: 0.000329259 -I0731 22:54:55.233342 10965 demo_trainer.cc:164] epoch: 11; average loss: 0.000278644 -I0731 22:56:15.496256 10965 demo_trainer.cc:164] epoch: 12; average loss: 0.000241055 -I0731 22:57:45.015926 10965 demo_trainer.cc:164] epoch: 13; average loss: 0.000212085 -I0731 22:59:18.419997 10965 demo_trainer.cc:164] epoch: 14; average loss: 0.000189109 -I0731 23:00:15.409077 10965 demo_trainer.cc:164] epoch: 15; average loss: 0.000170465 -I0731 23:01:38.795770 10965 demo_trainer.cc:164] epoch: 16; average loss: 0.000155051 -I0731 23:02:57.289487 10965 demo_trainer.cc:164] epoch: 17; average loss: 0.000142106 -I0731 23:03:48.032507 10965 demo_trainer.cc:164] epoch: 18; average loss: 0.000131089 -I0731 23:04:51.195230 10965 demo_trainer.cc:164] epoch: 19; average loss: 0.000121605 -I0731 23:06:27.008040 10965 demo_trainer.cc:164] epoch: 20; average loss: 0.00011336 -I0731 23:07:56.568284 10965 demo_trainer.cc:164] epoch: 21; average loss: 0.000106129 -I0731 23:09:23.948290 10965 demo_trainer.cc:164] epoch: 22; average loss: 9.97393e-05 -I0731 23:10:56.062590 10965 demo_trainer.cc:164] epoch: 23; average loss: 9.40532e-05 -I0731 23:12:23.014047 10965 demo_trainer.cc:164] 
epoch: 24; average loss: 8.89622e-05 -I0731 23:13:21.439818 10965 demo_trainer.cc:164] epoch: 25; average loss: 8.43784e-05 -I0731 23:14:56.171597 10965 demo_trainer.cc:164] epoch: 26; average loss: 8.02322e-05 -I0731 23:16:01.513542 10965 demo_trainer.cc:164] epoch: 27; average loss: 7.64629e-05 -I0731 23:17:18.709139 10965 demo_trainer.cc:164] epoch: 28; average loss: 7.30239e-05 -I0731 23:18:41.421555 10965 demo_trainer.cc:164] epoch: 29; average loss: 6.98716e-05 -``` - -I trained a Bow model and a CNN model on IMDB dataset using the trainer. At the same time, I also trained the same models using traditional Python training methods. -Results show that the two methods achieve almost the same dev accuracy: - -CNN: - - - -BOW: - - - -I also recorded the training speed of the C++ Trainer and the python training methods, C++ trainer is quicker on CNN model: - - - -#TODO (mapingshuo): find the reason why C++ trainer is quicker on CNN model than python method. diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc deleted file mode 100644 index 6d3b8e7ca4a840..00000000000000 --- a/paddle/fluid/train/imdb_demo/demo_trainer.cc +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "include/save_model.h" -#include "paddle/fluid/framework/data_feed_factory.h" -#include "paddle/fluid/framework/dataset_factory.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" - -#include "gflags/gflags.h" - -DEFINE_string(filelist, "train_filelist.txt", "filelist for fluid dataset"); -DEFINE_string(data_proto_desc, "data.proto", "data feed protobuf description"); -DEFINE_string(startup_program_file, "startup_program", - "startup program description"); -DEFINE_string(main_program_file, "", "main program description"); -DEFINE_string(loss_name, "mean_0.tmp_0", - "loss tensor name in the main program"); -DEFINE_string(save_dir, "cnn_model", "directory to save trained models"); -DEFINE_int32(epoch_num, 30, "number of epochs to run when training"); - -namespace paddle { -namespace train { - -void ReadBinaryFile(const std::string& filename, std::string* contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE_EQ( - fin.is_open(), true, - platform::errors::Unavailable("Failed to open file %s.", filename)); - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); -} - -std::unique_ptr LoadProgramDesc( - const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; - std::string program_desc_str; - ReadBinaryFile(model_filename, &program_desc_str); - std::unique_ptr main_program( - new paddle::framework::ProgramDesc(program_desc_str)); - return main_program; -} - -bool IsPersistable(const paddle::framework::VarDesc* var) { - if (var->Persistable() && - var->GetType() != paddle::framework::proto::VarType::FEED_MINIBATCH && - var->GetType() != paddle::framework::proto::VarType::FETCH_LIST && - var->GetType() != paddle::framework::proto::VarType::RAW) { - return true; - } - return false; -} - -} // namespace train -} // namespace paddle - -int main(int argc, char* argv[]) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); - - std::cerr << "filelist: " << FLAGS_filelist << std::endl; - std::cerr << "data_proto_desc: " << FLAGS_data_proto_desc << std::endl; - std::cerr << "startup_program_file: " << FLAGS_startup_program_file - << std::endl; - std::cerr << "main_program_file: " << FLAGS_main_program_file << std::endl; - std::cerr << "loss_name: " << FLAGS_loss_name << std::endl; - std::cerr << "save_dir: " << FLAGS_save_dir << std::endl; - std::cerr << "epoch_num: " << FLAGS_epoch_num << std::endl; - - std::string filelist = std::string(FLAGS_filelist); - std::vector file_vec; - std::ifstream fin(filelist); - if (fin) { - std::string filename; - while (fin >> filename) { - file_vec.push_back(filename); - } - } - PADDLE_ENFORCE_GE( - file_vec.size(), 1, - platform::errors::InvalidArgument( - "At least one file to train, but received number of file is %d.", - file_vec.size())); - paddle::framework::InitDevices(); - const auto cpu_place = paddle::platform::CPUPlace(); - paddle::framework::Executor executor(cpu_place); - paddle::framework::Scope scope; - auto startup_program = - 
paddle::train::LoadProgramDesc(std::string(FLAGS_startup_program_file)); - auto main_program = - paddle::train::LoadProgramDesc(std::string(FLAGS_main_program_file)); - - executor.Run(*startup_program, &scope, 0); - - std::string data_feed_desc_str; - paddle::train::ReadBinaryFile(std::string(FLAGS_data_proto_desc), - &data_feed_desc_str); - VLOG(3) << "load data feed desc done."; - std::unique_ptr dataset_ptr; - dataset_ptr = - paddle::framework::DatasetFactory::CreateDataset("MultiSlotDataset"); - VLOG(3) << "initialize dataset ptr done"; - - // find all params - std::vector param_names; - const paddle::framework::BlockDesc& global_block = main_program->Block(0); - for (auto* var : global_block.AllVars()) { - if (paddle::train::IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); - param_names.push_back(var->Name()); - } - } - - int epoch_num = FLAGS_epoch_num; - std::string loss_name = FLAGS_loss_name; - auto loss_var = scope.Var(loss_name); - - LOG(INFO) << "Start training..."; - - for (int epoch = 0; epoch < epoch_num; ++epoch) { - VLOG(3) << "Epoch:" << epoch; - // get reader - dataset_ptr->SetFileList(file_vec); - VLOG(3) << "set file list done"; - dataset_ptr->SetThreadNum(1); - VLOG(3) << "set thread num done"; - dataset_ptr->SetDataFeedDesc(data_feed_desc_str); - VLOG(3) << "set data feed desc done"; - dataset_ptr->CreateReaders(); - const std::vector readers = - dataset_ptr->GetReaders(); - PADDLE_ENFORCE_EQ(readers.size(), 1, - platform::errors::InvalidArgument( - "Readers num(%d) should be equal to thread num(1).", - readers.size())); - readers[0]->SetPlace(paddle::platform::CPUPlace()); - const std::vector& input_feed_names = - readers[0]->GetUseSlotAlias(); - for (auto name : input_feed_names) { - readers[0]->AddFeedVar(scope.Var(name), name); - } - VLOG(3) << "get reader done"; - readers[0]->Start(); - VLOG(3) << "start a reader"; - VLOG(3) << "readers size: " << readers.size(); - - int step = 0; - std::vector loss_vec; - - while (readers[0]->Next() > 0) { - executor.Run(*main_program, &scope, 0, false, true); - loss_vec.push_back( - loss_var->Get().data()[0]); - } - float average_loss = - accumulate(loss_vec.begin(), loss_vec.end(), 0.0) / loss_vec.size(); - - LOG(INFO) << "epoch: " << epoch << "; average loss: " << average_loss; - dataset_ptr->DestroyReaders(); - - // save model - std::string save_dir_root = FLAGS_save_dir; - std::string save_dir = - save_dir_root + "/epoch" + std::to_string(epoch) + ".model"; - paddle::framework::save_model(main_program, &scope, param_names, save_dir, - false); - } -} diff --git a/paddle/fluid/train/imdb_demo/generate_program.py b/paddle/fluid/train/imdb_demo/generate_program.py deleted file mode 100644 index a12282d94ddf9e..00000000000000 --- a/paddle/fluid/train/imdb_demo/generate_program.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import sys -import paddle -import logging -import paddle.fluid as fluid - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger("fluid") -logger.setLevel(logging.INFO) - - -def load_vocab(filename): - vocab = {} - with open(filename) as f: - wid = 0 - for line in f: - vocab[line.strip()] = wid - wid += 1 - vocab[""] = len(vocab) - return vocab - - -if __name__ == "__main__": - vocab = load_vocab('imdb.vocab') - dict_dim = len(vocab) - model_name = sys.argv[1] - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) - dataset.set_pipe_command("python imdb_reader.py") - - dataset.set_use_var([data, label]) - desc = dataset.proto_desc - - with open("data.proto", "w") as f: - f.write(dataset.desc()) - - from nets import * - if model_name == 'cnn': - logger.info("Generate program description of CNN net") - avg_cost, acc, prediction = cnn_net(data, label, dict_dim) - elif model_name == 'bow': - logger.info("Generate program description of BOW net") - avg_cost, acc, prediction = bow_net(data, label, dict_dim) - else: - logger.error("no such model: " + model_name) - exit(0) - # optimizer = fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) - optimizer.minimize(avg_cost) - - with open(model_name + "_main_program", "wb") as f: - f.write(fluid.default_main_program().desc.serialize_to_string()) - - with open(model_name + "_startup_program", "wb") as f: - f.write(fluid.default_startup_program().desc.serialize_to_string()) diff --git a/paddle/fluid/train/imdb_demo/imdb_reader.py b/paddle/fluid/train/imdb_demo/imdb_reader.py deleted file mode 100644 index f197c95ec32171..00000000000000 --- a/paddle/fluid/train/imdb_demo/imdb_reader.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -import paddle -import re -import paddle.fluid.incubate.data_generator as dg - - -class IMDBDataset(dg.MultiSlotDataGenerator): - def load_resource(self, dictfile): - self._vocab = {} - wid = 0 - with open(dictfile) as f: - for line in f: - self._vocab[line.strip()] = wid - wid += 1 - self._unk_id = len(self._vocab) - self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))') - self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0]) - - def get_words_and_label(self, line): - send = '|'.join(line.split('|')[:-1]).lower().replace("
", - " ").strip() - label = [int(line.split('|')[-1])] - - words = [x for x in self._pattern.split(send) if x and x != " "] - feas = [ - self._vocab[x] if x in self._vocab else self._unk_id for x in words - ] - return feas, label - - def infer_reader(self, infer_filelist, batch, buf_size): - def local_iter(): - for fname in infer_filelist: - with open(fname, "r") as fin: - for line in fin: - feas, label = self.get_words_and_label(line) - yield feas, label - - import paddle - batch_iter = paddle.batch( - paddle.reader.shuffle( - local_iter, buf_size=buf_size), - batch_size=batch) - return batch_iter - - def generate_sample(self, line): - def memory_iter(): - for i in range(1000): - yield self.return_value - - def data_iter(): - feas, label = self.get_words_and_label(line) - yield ("words", feas), ("label", label) - - return data_iter - - -if __name__ == "__main__": - imdb = IMDBDataset() - imdb.load_resource("imdb.vocab") - imdb.run_from_stdin() diff --git a/paddle/fluid/train/imdb_demo/include/save_model.h b/paddle/fluid/train/imdb_demo/include/save_model.h deleted file mode 100644 index 452052866855d2..00000000000000 --- a/paddle/fluid/train/imdb_demo/include/save_model.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -void save_model(const std::unique_ptr& main_program, Scope* scope, - const std::vector& param_names, - const std::string& model_name, bool save_combine); -} -} diff --git a/paddle/fluid/train/imdb_demo/nets.py b/paddle/fluid/train/imdb_demo/nets.py deleted file mode 100644 index a25e67e3b5d56d..00000000000000 --- a/paddle/fluid/train/imdb_demo/nets.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import time -import numpy as np - -import paddle -import paddle.fluid as fluid - - -def bow_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2): - """ - bow net - """ - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) - fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") - fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction - - -def cnn_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, - win_size=3): - """ - conv net - """ - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=win_size, - act="tanh", - pool_type="max") - - fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) - - prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction - - -def lstm_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, - emb_lr=30.0): - """ - lstm net - """ - emb = fluid.layers.embedding( - input=data, - size=[dict_dim, emb_dim], - param_attr=fluid.ParamAttr(learning_rate=emb_lr), - is_sparse=True) - - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) - - lstm_h, c = fluid.layers.dynamic_lstm( - input=fc0, size=hid_dim * 4, is_reverse=False) - - lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) - - fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') - - prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction - - -def gru_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, - emb_lr=400.0): - """ - gru net - """ - emb = fluid.layers.embedding( - input=data, - size=[dict_dim, emb_dim], - param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) - gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) - gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') - gru_max_tanh = fluid.layers.tanh(gru_max) - fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') - prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh deleted file mode 100644 index f71b4bac602a9e..00000000000000 --- a/paddle/fluid/train/imdb_demo/run.sh +++ /dev/null @@ -1,3 +0,0 @@ - -set -exu -build/demo_trainer 
--flagfile="train.cfg" diff --git a/paddle/fluid/train/imdb_demo/save_model.cc b/paddle/fluid/train/imdb_demo/save_model.cc deleted file mode 100644 index 49da550dbb7f52..00000000000000 --- a/paddle/fluid/train/imdb_demo/save_model.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "include/save_model.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/place.h" - -using std::unique_ptr; - -namespace paddle { -namespace framework { -void save_model(const unique_ptr& main_program, Scope* scope, - const std::vector& param_names, - const std::string& model_name, bool save_combine) { - auto place = platform::CPUPlace(); - const BlockDesc& global_block = main_program->Block(0); - std::vector paralist; - for (auto* var : global_block.AllVars()) { - bool is_model_param = false; - for (auto param_name : param_names) { - if (var->Name() == param_name) { - is_model_param = true; - break; - } - } - - if (!is_model_param) continue; - - if (!save_combine) { - VLOG(3) << "model var name: %s" << var->Name().c_str(); - - paddle::framework::AttributeMap attrs; - attrs.insert({"file_path", model_name + "/" + var->Name()}); - auto save_op = paddle::framework::OpRegistry::CreateOp( - "save", {{"X", {var->Name()}}}, {}, attrs); - - save_op->Run(*scope, place); - } else { - paralist.push_back(var->Name()); - } - } - if (save_combine) { - std::sort(paralist.begin(), paralist.end()); - paddle::framework::AttributeMap attrs; - attrs.insert({"file_path", model_name}); - auto save_op = paddle::framework::OpRegistry::CreateOp( - "save_combine", {{"X", paralist}}, {}, attrs); - save_op->Run(*scope, place); - } -} -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/train/imdb_demo/train.cfg b/paddle/fluid/train/imdb_demo/train.cfg deleted file mode 100644 index 1821498890be8c..00000000000000 --- a/paddle/fluid/train/imdb_demo/train.cfg +++ /dev/null @@ -1,7 +0,0 @@ ---filelist=train_filelist.txt ---data_proto_desc=data.proto ---loss_name=mean_0.tmp_0 ---startup_program_file=bow_startup_program ---main_program_file=bow_main_program ---save_dir=bow_model ---epoch_num=30 diff --git a/paddle/fluid/train/imdb_demo/train_filelist.txt b/paddle/fluid/train/imdb_demo/train_filelist.txt deleted file mode 100644 index dcf088af417619..00000000000000 --- a/paddle/fluid/train/imdb_demo/train_filelist.txt +++ /dev/null @@ -1,12 +0,0 @@ -train_data/part-0 -train_data/part-1 -train_data/part-10 -train_data/part-11 -train_data/part-2 
-train_data/part-3 -train_data/part-4 -train_data/part-5 -train_data/part-6 -train_data/part-7 -train_data/part-8 -train_data/part-9 diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc deleted file mode 100644 index 7a980cbac8b95f..00000000000000 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "gflags/gflags.h" -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/inference/io.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" - -DEFINE_string(dirname, "", "Directory of the train model."); - -namespace paddle { - -void Train(std::string model_dir) { - framework::InitDevices(); - const auto cpu_place = platform::CPUPlace(); - framework::Executor executor(cpu_place); - framework::Scope scope; - - auto train_program = inference::Load( - &executor, &scope, model_dir + "__model_combined__.main_program", - model_dir + "__params_combined__"); - - std::string loss_name = ""; - for (auto op_desc : train_program->Block(0).AllOps()) { - if (op_desc->Type() == "mean") { - loss_name = op_desc->Output("Out")[0]; - break; - } - } - - PADDLE_ENFORCE_NE(loss_name, "", - platform::errors::NotFound("Loss name is not found.")); - - // prepare data - auto x_var = scope.Var("img"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({64, 1, 28, 28}); - - auto x_data = x_tensor->mutable_data(cpu_place); - for (int i = 0; i < 64 * 28 * 28; ++i) { - x_data[i] = 1.0; - } - - auto y_var = scope.Var("label"); - auto y_tensor = y_var->GetMutable(); - y_tensor->Resize({64, 1}); - auto y_data = y_tensor->mutable_data(cpu_place); - for (int i = 0; i < 64 * 1; ++i) { - y_data[i] = static_cast(1); - } - - auto loss_var = scope.Var(loss_name); - float first_loss = 0.0; - float last_loss = 0.0; - for (int i = 0; i < 100; ++i) { - executor.Run(*train_program, &scope, 0, false, true, - {loss_name, "img", "label"}); - if (i == 0) { - first_loss = loss_var->Get().data()[0]; - } else if (i == 99) { - last_loss = loss_var->Get().data()[0]; - } - } - EXPECT_LT(last_loss, first_loss); -} - -TEST(train, recognize_digits) { - CHECK(!FLAGS_dirname.empty()); - Train(FLAGS_dirname + "recognize_digits_mlp.train.model/"); - Train(FLAGS_dirname + "recognize_digits_conv.train.model/"); -} - -} // namespace paddle diff --git a/paddle/scripts/build_docker_images.sh b/paddle/scripts/build_docker_images.sh index a90f0885294a9c..2b584cdca6b4ce 100644 --- a/paddle/scripts/build_docker_images.sh +++ b/paddle/scripts/build_docker_images.sh @@ -1,4 +1,19 @@ #!/bin/sh + +# Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -xe REPO="${REPO:-paddlepaddle}" diff --git a/paddle/scripts/docker/root/.scripts/git-completion.sh b/paddle/scripts/docker/root/.scripts/git-completion.sh index bdddef5ac2faf5..c43e88a4acd73a 100755 --- a/paddle/scripts/docker/root/.scripts/git-completion.sh +++ b/paddle/scripts/docker/root/.scripts/git-completion.sh @@ -1,4 +1,19 @@ #!bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # # bash/zsh completion support for core Git. # diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh index 1034b1c5c10435..cacec55d3bc228 100644 --- a/paddle/scripts/fast_install.sh +++ b/paddle/scripts/fast_install.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
 ## purple to echo
 function purple(){
     echo -e "\033[35m$1\033[0m"
 }
diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat
index 2edb062ac806fd..14e62d6761f245 100644
--- a/paddle/scripts/paddle_build.bat
+++ b/paddle/scripts/paddle_build.bat
@@ -52,6 +52,8 @@ if not defined WITH_CACHE set WITH_CACHE=OFF
 if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF
 if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo
 if not defined LOG_LEVEL set LOG_LEVEL=normal
+if not defined PRECISION_TEST set PRECISION_TEST=OFF
+if not defined NIGHTLY_MODE set NIGHTLY_MODE=OFF
 
 rem -------set cache build directory-----------
 rmdir build\python /s/q
@@ -359,9 +361,9 @@ if %GENERATOR% == "Ninja" (
     ninja -j %PARALLEL_PROJECT_COUNT%
 ) else (
     if "%WITH_CLCACHE%"=="OFF" (
-        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
+        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
     ) else (
-        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln
+        MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj
     )
 )
@@ -500,7 +502,15 @@ setlocal enabledelayedexpansion
 :: for /F %%# in ('cmd /C nvidia-smi -L ^|find "GPU" /C') do set CUDA_DEVICE_COUNT=%%#
 set CUDA_DEVICE_COUNT=1
 
-%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST%
+echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^
+-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^
+-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^
+-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^
+-DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^
+-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> %work_dir%\win_cmake.sh
+set FLAGS_fraction_of_gpu_memory_to_use=0.92
+
+%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU%
 
 goto:eof
 
@@ -509,7 +519,7 @@ echo ========================================
echo Running CPU unit tests in parallel way ...
echo ======================================== -ctest.exe -E "(%disable_ut_quickly%)" -LE %nightly_label% --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU% goto:eof diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7f184f189860d4..2df9e0198ee49f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -404,7 +404,7 @@ EOF tar -czf paddle_inference.tgz paddle_inference buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}') echo "Paddle_Inference Size: $buildSize" - echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" + echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt else SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then @@ -414,10 +414,10 @@ EOF fi buildSize=$($com ${PADDLE_ROOT}/build |awk '{print $1}') echo "Build Size: $buildSize" - echo "ipipe_log_param_Build_Size: $buildSize" + echo "ipipe_log_param_Build_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt PR_whlSize=$($com ${PADDLE_ROOT}/build/python/dist |awk '{print $1}') echo "PR whl Size: $PR_whlSize" - echo "ipipe_log_param_PR_whl_Size: $PR_whlSize" + echo "ipipe_log_param_PR_whl_Size: $PR_whlSize" >> ${PADDLE_ROOT}/build/build_summary.txt fi } @@ -442,7 +442,7 @@ function cmake_gen_and_build() { build $2 endTime_s=`date +%s` echo "Build Time: $[ $endTime_s - $startTime_s ]s" - echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function build_mac() { @@ -480,7 +480,7 @@ function cmake_gen_and_build_mac() { build_mac endTime_s=`date +%s` echo "Build Time: $[ $endTime_s - $startTime_s ]s" - echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function run_test() { @@ -684,7 +684,7 @@ EOF #mactest_error=$? 
ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_Mac_TestCases_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + echo "ipipe_log_param_Mac_TestCases_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt paddle version # Recovery proxy to avoid failure in later steps set +x @@ -991,12 +991,12 @@ function case_count(){ EOF testcases=$1 num=$(echo $testcases|grep -o '\^'|wc -l) - if [ "$2" == "" ]; then + if (( $2 == -1 )); then echo "exclusive TestCases count is $num" - echo "ipipe_log_param_Exclusive_TestCases_Count: $num" + echo "ipipe_log_param_Exclusive_TestCases_Count: $num" >> ${PADDLE_ROOT}/build/build_summary.txt else echo "$2 card TestCases count is $num" - echo "ipipe_log_param_${2}_Cards_TestCases_Count: $num" + echo "ipipe_log_param_${2}_Cards_TestCases_Count: $num" >> ${PADDLE_ROOT}/build/build_summary.txt fi } @@ -1034,6 +1034,11 @@ function card_test() { set -m case_count $1 $2 ut_startTime_s=`date +%s` + + testcases=$1 + cardnumber=$2 + parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} + # get the CUDA device count, XPU device count is one if [ "${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 @@ -1043,20 +1048,13 @@ function card_test() { CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi - testcases=$1 - parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} - if (( $# > 1 )); then - cardnumber=$2 - if (( $cardnumber > $CUDA_DEVICE_COUNT )); then - cardnumber=$CUDA_DEVICE_COUNT - fi - if (( $# > 2 )); then - parallel_job=`expr $3 \* $parallel_level_base` - else - parallel_job=$parallel_level_base - fi - else + if (( $cardnumber == -1 ));then cardnumber=$CUDA_DEVICE_COUNT + fi + + if (( $# > 2 )); then + parallel_job=`expr $3 \* $parallel_level_base` + else parallel_job=$parallel_level_base fi @@ -1098,12 +1096,12 @@ function card_test() { done wait; # wait for all subshells to finish ut_endTime_s=`date +%s` - if [ "$2" == "" ]; then + if (( $2 == -1 )); then echo "exclusive TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt else echo "$2 card TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_${2}_Cards_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + echo "ipipe_log_param_${2}_Cards_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt fi set +m } @@ -1153,13 +1151,18 @@ set -x set +x EXIT_CODE=0; test_cases=$(ctest -N -V) # get all test cases - single_card_tests_eight_parallel='^job$' # cases list which would run 8 job each time with single GPU - single_card_tests_tetrad_parallel='^job$' # cases list which would run 4 job each time with single GPU - single_card_tests_non_parallel_1='^job$' # cases list which would run 1 job each time with single GPU - single_card_tests_non_parallel_2='^job$' # cases list which would run 1 job each time with single GPU - single_card_tests='^job$' # all cases list which would take one graph card - exclusive_tests='' # cases list which would be run exclusively - multiple_card_tests='' # cases list which would take multiple GPUs, most cases would be two GPUs + # Note(zhouwei): Parallel runs are relative to 'CTEST_PARALLEL_LEVEL', e.g: '4 job each time' means 4*CTEST_PARALLEL_LEVEL + single_card_tests_high_parallel='^job$' # cases list 
which would run the most job each time with single GPU + single_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with single GPU + single_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with single GPU + single_card_tests='^job$' # all cases list which would take single GPU + + multiple_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with multiple GPUs, most cases would be two GPUs + multiple_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with multiple GPUs, most cases would be two GPUs + + exclusive_tests_two_parallel='^job$' # cases list which would run 2 job exclusively(with all GPUs) + exclusive_tests_non_parallel='^job$' # cases list which would run 1 job exclusively(with all GPUs) + is_exclusive='' # indicate whether the case is exclusive type is_multicard='' # indicate whether the case is multiple GPUs type is_nightly='' # indicate whether the case will only run at night @@ -1167,9 +1170,10 @@ set +x UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") - eight_parallel_job=$(echo $output | cut -d ";" -f 1) - tetrad_parallel_jog=$(echo $output | cut -d ";" -f 2) - non_parallel_job=$(echo $output | cut -d ";" -f 3) + cpu_parallel_job=$(echo $output | cut -d ";" -f 1) + tetrad_parallel_job=$(echo $output | cut -d ";" -f 2) + two_parallel_job=$(echo $output | cut -d ";" -f 3) + non_parallel_job=$(echo $output | cut -d ";" -f 4) while read -r line; do if [[ "$line" == "" ]]; then continue @@ -1211,26 +1215,24 @@ set +x fi if [[ "$is_exclusive" != "" ]]; then - if [[ "$exclusive_tests" == "" ]]; then - exclusive_tests="^$testcase$" + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$" else - exclusive_tests="$exclusive_tests|^$testcase$" + exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$" fi elif [[ "$is_multicard" != "" ]]; then - if [[ "$multiple_card_tests" == "" ]]; then - multiple_card_tests="^$testcase$" + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$" else - multiple_card_tests="$multiple_card_tests|^$testcase$" + multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$" fi else - if [[ $(echo $eight_parallel_job | grep $testcase) != "" ]]; then - single_card_tests_eight_parallel="$single_card_tests_eight_parallel|^$testcase$" - elif [[ $(echo $tetrad_parallel_jog | grep $testcase) != "" ]]; then - single_card_tests_tetrad_parallel="$single_card_tests_tetrad_parallel|^$testcase$" - elif [[ "${#single_card_tests_non_parallel_1}" -gt 10000 ]];then - single_card_tests_non_parallel_2="$single_card_tests_non_parallel_2|^$testcase$" + if [[ $(echo $cpu_parallel_job | grep -o $testcase) != "" ]]; then + single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$" + elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$" else - single_card_tests_non_parallel_1="$single_card_tests_non_parallel_1|^$testcase$" + single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$" fi single_card_tests="$single_card_tests|^$testcase$" fi @@ -1241,12 
+1243,13 @@ set +x testcase='' done <<< "$test_cases"; - card_test "$single_card_tests_eight_parallel" 1 8 # run cases 8 job each time with single GPU - card_test "$single_card_tests_tetrad_parallel" 1 4 # run cases 4 job each time with single GPU - card_test "$single_card_tests_non_parallel_1" 1 # run cases 1 job each time with single GPU - card_test "$single_card_tests_non_parallel_2" 1 # run cases 1 job each time with single GPU - card_test "$multiple_card_tests" 2 # run cases with two GPUs - card_test "$exclusive_tests" # run cases exclusively, in this cases would be run with 4/8 GPUs + card_test "$single_card_tests_high_parallel" 1 8 # run cases the most each time with single GPU + card_test "$single_card_tests_two_parallel" 1 2 # run cases 2 job each time with single GPU + card_test "$single_card_tests_non_parallel" 1 # run cases 1 job each time with single GPU + card_test "$multiple_card_tests_two_parallel" 2 2 # run cases 2 job each time with two GPUs + card_test "$multiple_card_tests_non_parallel" 2 # run cases 1 job each time with two GPUs + card_test "$exclusive_tests_two_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs + card_test "$exclusive_tests_non_parallel" -1 # run cases exclusively, in this cases would be run with 2/4/8 GPUs collect_failed_tests rm -f $tmp_dir/* exec_times=0 @@ -1319,7 +1322,7 @@ set +x fi if [[ "$exclusive_retry" != "" ]]; then - card_test "$exclusive_retry" + card_test "$exclusive_retry" -1 fi exec_times=$[$exec_times+1] @@ -1445,7 +1448,7 @@ function parallel_test() { fi ut_total_endTime_s=`date +%s` echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" - echo "ipipe_log_param_TestCases_Total_Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" + echo "ipipe_log_param_TestCases_Total_Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function enable_unused_var_check() { @@ -1725,7 +1728,7 @@ EOF fi endTime_s=`date +%s` echo "Build Time: $[ $endTime_s - $startTime_s ]s" - echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt build_size "paddle_inference" } @@ -1757,7 +1760,7 @@ EOF EXIT_CODE=$? 
fluid_endTime_s=`date +%s` echo "test_fluid_lib Total Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s" - echo "ipipe_log_param_Test_Fluid_Lib_Total_Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s" + echo "ipipe_log_param_Test_Fluid_Lib_Total_Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt ./clean.sh if [[ "$EXIT_CODE" != "0" ]]; then exit 8; @@ -1804,7 +1807,7 @@ function example() { function collect_ccache_hits() { rate=$(ccache -s | grep 'cache hit rate' | awk '{print $4}') echo "ccache hit rate: ${rate}%" - echo "ipipe_log_param_Ccache_Hit_Rate: ${rate}%" + echo "ipipe_log_param_Ccache_Hit_Rate: ${rate}%" >> ${PADDLE_ROOT}/build/build_summary.txt } @@ -2026,6 +2029,12 @@ function main() { exit 1 ;; esac + set +x + if [[ -f ${PADDLE_ROOT}/build/build_summary.txt ]];then + echo "=====================build summary======================" + cat ${PADDLE_ROOT}/build/build_summary.txt + echo "========================================================" + fi echo "paddle_build script finished as expected" } diff --git a/patches/eigen/Meta.h b/patches/eigen/Meta.h new file mode 100755 index 00000000000000..b7b789a19c4e9a --- /dev/null +++ b/patches/eigen/Meta.h @@ -0,0 +1,806 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2015 Gael Guennebaud +// Copyright (C) 2006-2008 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_META_H +#define EIGEN_META_H + +#if defined(EIGEN_GPU_COMPILE_PHASE) + + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + +#endif + +// Recent versions of ICC require for pointer types below. +#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11) + +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT +#include +namespace Eigen { +namespace numext { +typedef std::uint8_t uint8_t; +typedef std::int8_t int8_t; +typedef std::uint16_t uint16_t; +typedef std::int16_t int16_t; +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provide the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint8_t uint8_t; +typedef ::int8_t int8_t; +typedef ::uint16_t uint16_t; +typedef ::int16_t int16_t; +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} +#endif + +namespace Eigen { + +typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex; + +/** + * \brief The Index type as used for the API. + * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. + * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex. + */ + +typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index; + +namespace internal { + +/** \internal + * \file Meta.h + * This file contains generic metaprogramming classes which are not specifically related to Eigen. + * \note In case you wonder, yes we're aware that Boost already provides all these features, + * we however don't want to add a dependency to Boost. 
+ */ + +// Only recent versions of ICC complain about using ptrdiff_t to hold pointers, +// and older versions do not provide *intptr_t types. +#if EIGEN_ICC_NEEDS_CSTDINT +typedef std::intptr_t IntPtr; +typedef std::uintptr_t UIntPtr; +#else +typedef std::ptrdiff_t IntPtr; +typedef std::size_t UIntPtr; +#endif +#undef EIGEN_ICC_NEEDS_CSTDINT + +struct true_type { enum { value = 1 }; }; +struct false_type { enum { value = 0 }; }; + +template +struct bool_constant; + +template<> +struct bool_constant : true_type {}; + +template<> +struct bool_constant : false_type {}; + +template +struct conditional { typedef Then type; }; + +template +struct conditional { typedef Else type; }; + +template struct remove_reference { typedef T type; }; +template struct remove_reference { typedef T type; }; + +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; + +template struct remove_const { typedef T type; }; +template struct remove_const { typedef T type; }; +template struct remove_const { typedef T type[]; }; +template struct remove_const { typedef T type[Size]; }; + +template struct remove_all { typedef T type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; + +template struct is_arithmetic { enum { value = false }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic{ enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template struct is_same { enum { value = 0 }; }; +template struct is_same { enum { value = 1 }; }; + +template< class T > +struct is_void : is_same::type> {}; + +#if EIGEN_HAS_CXX11 +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +using std::is_integral; +#else +template struct is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +#if EIGEN_COMP_MSVC +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { 
value = true }; }; +#endif +#endif + +#if EIGEN_HAS_CXX11 +using std::make_unsigned; +#else +// TODO: Possibly improve this implementation of make_unsigned. +// It is currently used only by +// template struct random_default_impl. +template struct make_unsigned; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +#if EIGEN_COMP_MSVC +template<> struct make_unsigned { typedef unsigned __int64 type; }; +template<> struct make_unsigned { typedef unsigned __int64 type; }; +#endif + +// Some platforms define int64_t as long long even for C++03. In this case we +// are missing the definition for make_unsigned. If we just define it, we get +// duplicated definitions for platforms defining int64_t as signed long for +// C++03. We therefore add the specialization for C++03 long long for these +// platforms only. +#if EIGEN_OS_MAC +template<> struct make_unsigned { typedef unsigned long long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; +#endif +#endif + +template struct add_const { typedef const T type; }; +template struct add_const { typedef T& type; }; + +template struct is_const { enum { value = 0 }; }; +template struct is_const { enum { value = 1 }; }; + +template struct add_const_on_value_type { typedef const T type; }; +template struct add_const_on_value_type { typedef T const& type; }; +template struct add_const_on_value_type { typedef T const* type; }; +template struct add_const_on_value_type { typedef T const* const type; }; +template struct add_const_on_value_type { typedef T const* const type; }; + +#if EIGEN_HAS_CXX11 + +using std::is_convertible; + +#else + +template +struct is_convertible_impl +{ +private: + struct any_conversion + { + template any_conversion(const volatile T&); + template any_conversion(T&); + }; + struct yes {int a[1];}; + struct no {int a[2];}; + + template + static yes test(T, int); + + template + static no test(any_conversion, ...); + +public: + static typename internal::remove_reference::type* ms_from; +#ifdef __INTEL_COMPILER + #pragma warning push + #pragma warning ( disable : 2259 ) +#endif + enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; +#ifdef __INTEL_COMPILER + #pragma warning pop +#endif +}; + +template +struct is_convertible +{ + enum { value = is_convertible_impl::value }; +}; + +template +struct is_convertible { enum { value = false }; }; + +template +struct is_convertible { enum { value = true }; }; + +#endif + +/** \internal Allows to enable/disable an overload + * according to a compile time condition. 
+ */ +template struct enable_if; + +template struct enable_if +{ typedef T type; }; + +#if defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(__FLT_EPSILON__) +#define __FLT_EPSILON__ FLT_EPSILON +#define __DBL_EPSILON__ DBL_EPSILON +#endif + +namespace device { + +template struct numeric_limits +{ + EIGEN_DEVICE_FUNC static T epsilon() { return 0; } + EIGEN_DEVICE_FUNC static T (max)() { assert(false && "Highest not supported for this type"); } + EIGEN_DEVICE_FUNC static T (min)() { assert(false && "Lowest not supported for this type"); } + EIGEN_DEVICE_FUNC static T infinity() { assert(false && "Infinity not supported for this type"); } + EIGEN_DEVICE_FUNC static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static float epsilon() { return __FLT_EPSILON__; } + EIGEN_DEVICE_FUNC + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } + EIGEN_DEVICE_FUNC + static float (min)() { return FLT_MIN; } + EIGEN_DEVICE_FUNC + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static double epsilon() { return __DBL_EPSILON__; } + EIGEN_DEVICE_FUNC + static double (max)() { return DBL_MAX; } + EIGEN_DEVICE_FUNC + static double (min)() { return DBL_MIN; } + EIGEN_DEVICE_FUNC + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static int (max)() { return INT_MAX; } + EIGEN_DEVICE_FUNC + static int (min)() { return INT_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned int (max)() { return UINT_MAX; } + EIGEN_DEVICE_FUNC + static unsigned int (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static long (max)() { return LONG_MAX; } + EIGEN_DEVICE_FUNC + static long (min)() { return LONG_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long (max)() { return ULONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static long long (max)() { return LLONG_MAX; } + EIGEN_DEVICE_FUNC + static long long (min)() { return LLONG_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long long (max)() { return ULLONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long long (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static bool epsilon() { return false; } + EIGEN_DEVICE_FUNC + static bool (max)() { return true; } + EIGEN_DEVICE_FUNC + static bool (min)() { return false; 
} +}; + +} + +#endif + +/** \internal + * A base class do disable default copy ctor and copy assignment operator. + */ +class noncopyable +{ + EIGEN_DEVICE_FUNC noncopyable(const noncopyable&); + EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&); +protected: + EIGEN_DEVICE_FUNC noncopyable() {} + EIGEN_DEVICE_FUNC ~noncopyable() {} +}; + +/** \internal + * Provides access to the number of elements in the object of as a compile-time constant expression. + * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). + * + * Similar to std::tuple_size, but more general. + * + * It currently supports: + * - any types T defining T::SizeAtCompileTime + * - plain C arrays as T[N] + * - std::array (c++11) + * - some internal types such as SingleRange and AllRange + * + * The second template parameter eases SFINAE-based specializations. + */ +template struct array_size { + enum { value = Dynamic }; +}; + +template struct array_size::type> { + enum { value = T::SizeAtCompileTime }; +}; + +template struct array_size { + enum { value = N }; +}; +template struct array_size { + enum { value = N }; +}; + +#if EIGEN_HAS_CXX11 +template struct array_size > { + enum { value = N }; +}; +template struct array_size > { + enum { value = N }; +}; +#endif + +/** \internal + * Analogue of the std::size free function. + * It returns the size of the container or view \a x of type \c T + * + * It currently supports: + * - any types T defining a member T::size() const + * - plain C arrays as T[N] + * + */ +template +Index size(const T& x) { return x.size(); } + +template +Index size(const T (&) [N]) { return N; } + +/** \internal + * Convenient struct to get the result type of a nullary, unary, binary, or + * ternary functor. + * + * Pre C++11: + * Supports both a Func::result_type member and templated + * Func::result::type member. + * + * If none of these members is provided, then the type of the first + * argument is returned. + * + * Post C++11: + * This uses std::result_of. However, note the `type` member removes + * const and converts references/pointers to their corresponding value type. 
+ */ +#if EIGEN_HAS_STD_INVOKE_RESULT +template struct result_of; + +template +struct result_of { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_STD_RESULT_OF +template struct result_of { + typedef typename std::result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template struct result_of { }; + +struct has_none {int a[1];}; +struct has_std_result_type {int a[2];}; +struct has_tr1_result {int a[3];}; + +template +struct nullary_result_of_select {}; + +template +struct nullary_result_of_select {typedef typename Func::result_type type;}; + +template +struct nullary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename nullary_result_of_select::type type; +}; + +template +struct unary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct unary_result_of_select {typedef typename Func::result_type type;}; + +template +struct unary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename unary_result_of_select::type type; +}; + +template +struct binary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct binary_result_of_select +{typedef typename Func::result_type type;}; + +template +struct binary_result_of_select +{typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename binary_result_of_select::type type; +}; + +template +struct ternary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct ternary_result_of_select +{typedef typename Func::result_type type;}; + +template +struct ternary_result_of_select +{typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename ternary_result_of_select::type type; +}; + +#endif + +#if EIGEN_HAS_STD_INVOKE_RESULT +template +struct invoke_result { + typedef typename std::invoke_result::type type1; + 
typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_CXX11 +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#endif + +struct meta_yes { char a[1]; }; +struct meta_no { char a[2]; }; + +// Check whether T::ReturnType does exist +template +struct has_ReturnType +{ + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template const T* return_ptr(); + +template +struct has_nullary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()())>0)>::type * = 0); + static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template +struct has_unary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0)))>0)>::type * = 0); + static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template +struct has_binary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0); + static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer. + * Usage example: \code meta_sqrt<1023>::ret \endcode + */ +template Y))) > + // use ?: instead of || just to shut up a stupid gcc 4.3 warning +class meta_sqrt +{ + enum { + MidX = (InfX+SupX)/2, + TakeInf = MidX*MidX > Y ? 1 : 0, + NewInf = int(TakeInf) ? InfX : int(MidX), + NewSup = int(TakeInf) ? int(MidX) : SupX + }; + public: + enum { ret = meta_sqrt::ret }; +}; + +template +class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; }; + + +/** \internal Computes the least common multiple of two positive integer A and B + * at compile-time. It implements a naive algorithm testing all multiples of A. + * It thus works better if A>=B. + */ +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; +template +struct meta_least_common_multiple +{ + enum { ret = A*K }; +}; + +/** \internal determines whether the product of two numeric types is allowed and what the return type is */ +template struct scalar_product_traits +{ + enum { Defined = 0 }; +}; + +// FIXME quick workaround around current limitation of result_of +// template +// struct result_of(ArgType0,ArgType1)> { +// typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; +// }; + +/** \internal Obtains a POD type suitable to use as storage for an object of a size + * of at most Len bytes, aligned as specified by \c Align. 
+ */ +template +struct aligned_storage { + struct type { + EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; + }; +}; + +} // end namespace internal + +namespace numext { + +#if defined(EIGEN_GPU_COMPILE_PHASE) +template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } +#else +template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } +#endif + +#if defined(EIGEN_GPU_COMPILE_PHASE) +using internal::device::numeric_limits; +#else +using std::numeric_limits; +#endif + +// Integer division with rounding up. +// T is assumed to be an integer type with a>=0, and b>0 +template +EIGEN_DEVICE_FUNC +T div_ceil(const T &a, const T &b) +{ + return (a+b-1) / b; +} + +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const X& x,const Y& y) { return x == y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } +#endif + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } +#endif + +} // end namespace numext + +} // end namespace Eigen + +#endif // EIGEN_META_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 938547f363cfbb..9b03cd08ba97a2 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -18,10 +18,10 @@ set(FLUID_CORE_NAME "core") if(WITH_AVX AND AVX_FOUND) set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_avx") if(NOT DEFINED NOAVX_CORE_FILE OR NOAVX_CORE_FILE STREQUAL "") - message(STATUS "WARNING: This is just a warning for publishing release. + message(STATUS "MESSAGE: This is just a message for publishing release. You are building AVX version without NOAVX core. So the wheel package may fail on NOAVX machine. - You can add -DFLUID_CORE_NAME=/path/to/your/core_noavx.* in cmake command + You can add -DNOAVX_CORE_FILE=/path/to/your/core_noavx.* in cmake command to get a full wheel package to resolve this warning. While, this version will still work on local machine.") endif() diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 8dabe19f57c58f..02725751cb6694 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -44,6 +44,7 @@ import paddle.device import paddle.regularizer import paddle.incubate +import paddle.autograd # TODO: define alias in tensor and framework directory diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py new file mode 100644 index 00000000000000..8b3f3086a4a728 --- /dev/null +++ b/python/paddle/autograd/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..fluid.dygraph.base import grad  #DEFINE_ALIAS
+
+from . import backward_mode
+from .backward_mode import backward
+
+__all__ = ['grad']
+
+__all__ += backward_mode.__all__
diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py
new file mode 100644
index 00000000000000..96e4336abaa6fa
--- /dev/null
+++ b/python/paddle/autograd/backward_mode.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid import core
+from paddle.fluid import framework
+import paddle
+__all__ = ['backward']
+
+
+@framework.dygraph_only
+def backward(tensors, grad_tensors=None, retain_graph=False):
+    """
+    Compute the backward gradients of given tensors.
+
+    Args:
+        tensors(list of Tensors): the tensors for which the gradients are to be computed. The list must not contain the same tensor more than once.
+
+        grad_tensors(list of Tensors or None, optional): the initial gradients of ``tensors``. If not None, it must have the same length as ``tensors``,
+            and any element that is None falls back to the default initial gradient, a tensor filled with 1.0.
+            If None, all the initial gradients of ``tensors`` default to tensors filled with 1.0.
+            Defaults to None.
+
+        retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
+            like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter
+            :code:`retain_graph` to True, then the grads will be retained. Thus, setting it to False is much more memory-efficient.
+            Defaults to False.
+
+    Returns:
+        NoneType: None
+
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            x = paddle.to_tensor([[1, 2], [3, 4]], dtype='float32', stop_gradient=False)
+            y = paddle.to_tensor([[3, 2], [3, 4]], dtype='float32')
+
+            grad_tensor1 = paddle.to_tensor([[1,2], [2, 3]], dtype='float32')
+            grad_tensor2 = paddle.to_tensor([[1,1], [1, 1]], dtype='float32')
+
+            z1 = paddle.matmul(x, y)
+            z2 = paddle.matmul(x, y)
+
+            paddle.autograd.backward([z1, z2], [grad_tensor1, grad_tensor2], True)
+            print(x.grad)
+            #[[12. 18.]
+            # [17. 25.]]
+
+            x.clear_grad()
+
+            paddle.autograd.backward([z1, z2], [grad_tensor1, None], True)
+            print(x.grad)
+            #[[12. 18.]
+            # [17. 25.]]
+
+            x.clear_grad()
+
+            paddle.autograd.backward([z1, z2])
+            print(x.grad)
+            #[[10. 14.]
+            # [10. 14.]]
+
+    """
+
+    def check_tensors(in_out_list, name):
+        assert in_out_list is not None, "{} should not be None".format(name)
+
+        if isinstance(in_out_list, (list, tuple)):
+            assert len(in_out_list) > 0, "{} cannot be empty".format(name)
+            for each_var in in_out_list:
+                assert isinstance(
+                    each_var, paddle.
+                    Tensor), "Elements of {} must be paddle.Tensor".format(name)
+            return in_out_list
+        else:
+            assert isinstance(
+                in_out_list,
+                paddle.Tensor), "{} must be Tensor or list of Tensor".format(
+                    name)
+            return [in_out_list]
+
+    tensors = check_tensors(tensors, "tensors")
+
+    assert len(tensors) == len(
+        set(tensors)
+    ), "The argument 'tensors' of paddle.autograd.backward contains duplicate paddle.Tensor objects."
+
+    if grad_tensors is not None:
+        if not isinstance(grad_tensors, (list, tuple)):
+            grad_tensors = [grad_tensors]
+
+        for each_tensor in grad_tensors:
+            if each_tensor is not None:
+                assert isinstance(
+                    each_tensor, paddle.Tensor
+                ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'."
+    else:
+        grad_tensors = [None] * len(tensors)
+
+    if len(grad_tensors) > 0:
+        assert len(tensors) == len(
+            grad_tensors), "The length of grad_tensors must be equal to that of tensors"
+
+    assert isinstance(retain_graph, bool), "retain_graph must be True or False"
+
+    core.dygraph_run_backward(tensors, grad_tensors, retain_graph,
+                              framework._dygraph_tracer())
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
index a6eb896802f112..2756dea72e84a9 100644
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -26,6 +26,9 @@
 import paddle.fluid.core as core
 
 __all__ = [
+    'wait',
+    'new_group',
+    'get_group',
     'broadcast',
     'all_reduce',
     'reduce',
@@ -75,30 +78,220 @@ class ReduceOp:
     PROD = 3
 
 
-class _Group():
-    """The abstract representation of group."""
+class Group():
+    """
+    The abstract representation of group.
+    """
 
-    def __init__(self, rank, rank_num):
+    def __init__(self, rank, rank_num, id=0, ranks=[]):
         self.rank = rank
         self.nranks = rank_num
+        self.id = id
+        self.ranks = ranks
+
+    def is_member(self):
+        if self.rank < 0:
+            return False
+        if self.nranks < 2:
+            return False
+        return True
+
+    def get_group_rank(self, rank):
+        if self.id == 0:
+            return rank
+        if self.is_member() and rank in self.ranks:
+            return self.ranks.index(rank)
+        else:
+            return -1
+
+
+_global_env = None
+
+
+def _get_global_env():
+    global _global_env
+    if not _global_env:
+        _global_env = paddle.distributed.ParallelEnv()
+    return _global_env
+
+
+# group map : the map of all groups, 0 for GlobalGroup
+# Dict[int, Group]
+_group_map = {}
+
+
+def _get_group_map():
+    global _group_map
+    if not _group_map:
+        genv = _get_global_env()
+        _group_map[0] = Group(genv.rank, genv.world_size, 0)
+    return _group_map
+
+
+def _get_global_group():
+    return _get_group_map()[0]
+
+
+def _new_ring_id():
+    return len(_get_group_map()) + max(_get_global_env().nrings, 9)
+
+
+def get_group(id=0):
+    """
+
+    Get the group instance by group id.
+
+    Args:
+        id (int): the group id. Default value is 0.
+
+    Returns:
+        Group: the group instance, or None if no group has the given id.
+
+    Examples:
+        .. code-block:: python
+
+            ...
+            gid = paddle.distributed.new_group([2,4,6])
+            paddle.distributed.get_group(gid.id)
+
+    """
+
+    gm = _get_group_map()
+    return gm[id] if id in gm else None
+
+
+def new_group(ranks=None, backend=None):
+    """
+
+    Creates a new distributed communication group.
+
+    Args:
+        ranks (list): The global ranks of group members.
+        backend (str): The backend used to create the group; only nccl is supported now.
+
+    Returns:
+        Group: The group instance.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            paddle.distributed.init_parallel_env()
+            tindata = paddle.randn(shape=[2, 3])
+            gp = paddle.distributed.new_group([2,4,6])
+            paddle.distributed.all_reduce(tindata, group=gp, use_calc_stream=False)
+
+    """
+
+    if not backend:
+        backend = 'nccl'
+    assert backend == 'nccl', ("backend other than nccl is not supported yet")
+
+    genv = _get_global_env()
+    global_rank = genv.rank
+
+    ring_id = _new_ring_id()
+
+    global _group_map
+    if global_rank not in ranks:
+        gp = Group(-1, -1, ring_id, ranks)
+        _group_map[ring_id] = gp
+        return gp
+
+    ranks = sorted(ranks)
+    group_rank = ranks.index(global_rank)
+    group_size = len(ranks)
+    gp = Group(group_rank, group_size, ring_id, ranks)
+    _group_map[ring_id] = gp
+
+    if group_size < 2:
+        return gp
+
+    strategy = core.ParallelStrategy()
+    strategy.nranks = group_size
+    strategy.local_rank = group_rank
+    strategy.trainer_endpoints = [genv.trainer_endpoints[i] for i in ranks]
+    strategy.current_endpoint = genv.current_endpoint
+    strategy.nrings = 1
+
+    if core.is_compiled_with_cuda():
+        place = core.CUDAPlace(genv.device_id)
+        core.NCCLParallelContext(strategy, place).init_with_ring_id(ring_id)
+    else:
+        assert False, ("no cuda device found")
+
+    return gp
+
+
+def wait(tensor, group=None, use_calc_stream=True):
+    """
+
+    Synchronize the calculation or communication stream for the given group.
 
-# NOTE(chenweihang): Lazily initialized global group information
-# If we initialize _default_group when import module, it will
-# not update when we use spawn to run multi-process training
-_default_group = None
+    Args:
+        tensor (Tensor): The Tensor on which to wait.
+        group (Group): The Group instance on which to perform the sync.
+        use_calc_stream (bool): Whether to use the calculation stream (True) or the communication stream (False).
+            Defaults to True.
 
+    Returns:
+        None.
 
-def _get_global_default_group():
-    global _default_group
-    if _default_group is None:
-        _default_group = _Group(
-            int(os.getenv("PADDLE_TRAINER_ID", "0")),
-            int(os.getenv("PADDLE_TRAINERS_NUM", "1")))
-    return _default_group
+    Examples:
+        .. code-block:: python
 
+            import paddle
+
+            paddle.distributed.init_parallel_env()
+            tindata = paddle.randn(shape=[2, 3])
+            paddle.distributed.all_reduce(tindata, use_calc_stream=True)
+            paddle.distributed.wait(tindata)
 
-def broadcast(tensor, src, group=0):
+    """
+
+    if group is not None and not group.is_member():
+        return
+
+    ring_id = 0 if group is None else group.id
+
+    if use_calc_stream:
+        _sync_calc_stream(tensor)
+    else:
+        _sync_comm_stream(tensor, ring_id)
+
+
+def _sync_calc_stream(tensor):
+
+    if in_dygraph_mode():
+        return core.ops.c_sync_calc_stream(tensor, tensor)
+
+    op_type = 'c_sync_calc_stream'
+
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]}, )
+
+
+def _sync_comm_stream(tensor, ring_id=0):
+
+    if in_dygraph_mode():
+        return core.ops.c_sync_comm_stream([tensor], [tensor], 'ring_id',
+                                           ring_id)
+
+    op_type = 'c_sync_comm_stream'
+
+    helper = LayerHelper(op_type, **locals())
+    helper.append_op(
+        type=op_type,
+        inputs={'X': [tensor]},
+        outputs={'Out': [tensor]},
+        attrs={'ring_id': ring_id}, )
+
+
+def broadcast(tensor, src, group=None, use_calc_stream=True):
     """
 
     Broadcast a tensor from the source to all others.
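Taken together, new_group and wait compose as below; a minimal sketch, assuming a multi-process job started with paddle.distributed.launch so that global ranks 0 and 1 exist (the ranks and shape are illustrative, not part of the patch):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    # Sub-group over global ranks 0 and 1; ranks outside the list receive a
    # non-member Group, so the collectives below become no-ops for them.
    gp = dist.new_group(ranks=[0, 1])

    data = paddle.randn(shape=[2, 3])
    # Launch the collective on the communication stream, then block on that
    # stream before the result is consumed.
    dist.all_reduce(data, group=gp, use_calc_stream=False)
    dist.wait(data, group=gp, use_calc_stream=False)
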
@@ -107,7 +300,9 @@ def broadcast(tensor, src, group=0):
         tensor (Tensor): The Tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type
             should be float16, float32, float64, int32 or int64.
         src (int): The source rank.
-        group (int): The process group to work on. It is Optional.
+        group (Group): The group instance returned by new_group or None for the global default group.
+        use_calc_stream (bool): Whether to use calculation stream (True) or communication stream (False).
+            Defaults to True.
 
     Returns:
         None.
@@ -130,17 +325,26 @@ def broadcast(tensor, src, group=0):
             out = data.numpy()
             # [[1, 2, 3], [1, 2, 3]]
     """
+
+    if group is not None and not group.is_member():
+        return
+
+    if not isinstance(src, int):
+        raise ValueError("src should be int.")
+
+    ring_id = 0 if group is None else group.id
+    gsrc = src if group is None else group.get_group_rank(src)
+    assert gsrc >= 0, ("src rank out of group, need global rank")
+
     if in_dygraph_mode():
-        return core.ops.c_broadcast(tensor, tensor, 'root', src,
-                                    'use_calc_stream', True, 'ring_id', group)
+        return core.ops.c_broadcast(tensor, tensor, 'root', gsrc,
+                                    'use_calc_stream', use_calc_stream,
+                                    'ring_id', ring_id)
 
     op_type = 'c_broadcast'
     check_variable_and_dtype(
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         'broadcast')
-    if not isinstance(src, int) or not isinstance(group, int):
-        raise ValueError("Both the type of 'src' and 'group' for broadcast "
-                         "should be int.")
 
     helper = LayerHelper(op_type, **locals())
     helper.append_op(
@@ -148,13 +352,13 @@ def broadcast(tensor, src, group=0):
         inputs={'X': [tensor]},
         outputs={'Out': [tensor]},
         attrs={
-            'root': src,
-            'use_calc_stream': True,
-            'ring_id': group,
+            'root': gsrc,
+            'use_calc_stream': use_calc_stream,
+            'ring_id': ring_id,
         })
 
 
-def all_reduce(tensor, op=ReduceOp.SUM, group=0):
+def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True):
     """
 
     Reduce a tensor over all ranks so that all get the result.
@@ -162,8 +366,10 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0):
     Args:
         tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type
             should be float16, float32, float64, int32 or int64.
-        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used.
-        group (int): Optional. The process group to work on.
+        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default value is ReduceOp.SUM.
+        group (Group): The group instance returned by new_group or None for the global default group.
+        use_calc_stream (bool): Whether to use calculation stream (True) or communication stream (False).
+            Defaults to True.
 
     Returns:
         None.
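A small usage sketch for the reworked all_reduce signature, not taken from the patch; it assumes a job with at least two trainers and that ReduceOp is re-exported on paddle.distributed, as the docstrings in this module already imply:

    import paddle

    paddle.distributed.init_parallel_env()
    t = paddle.to_tensor([1.0, 2.0, 3.0])
    # every rank ends up holding the element-wise maximum across ranks
    paddle.distributed.all_reduce(t, op=paddle.distributed.ReduceOp.MAX)
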
@@ -187,19 +393,25 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0):
             out = data.numpy()
             # [[5, 7, 9], [5, 7, 9]]
     """
+    if group is not None and not group.is_member():
+        return
+
+    ring_id = 0 if group is None else group.id
+
     if in_dygraph_mode():
         if op == ReduceOp.SUM:
             return core.ops.c_allreduce_sum(tensor, tensor, 'use_calc_stream',
-                                            True, 'ring_id', group)
+                                            use_calc_stream, 'ring_id', ring_id)
         elif op == ReduceOp.MAX:
             return core.ops.c_allreduce_max(tensor, tensor, 'use_calc_stream',
-                                            True, 'ring_id', group)
+                                            use_calc_stream, 'ring_id', ring_id)
         elif op == ReduceOp.MIN:
             return core.ops.c_allreduce_min(tensor, tensor, 'use_calc_stream',
-                                            True, 'ring_id', group)
+                                            use_calc_stream, 'ring_id', ring_id)
         elif op == ReduceOp.PROD:
             return core.ops.c_allreduce_prod(tensor, tensor, 'use_calc_stream',
-                                             True, 'ring_id', group)
+                                             use_calc_stream, 'ring_id',
+                                             ring_id)
         else:
             raise ValueError("Unknown parameter: {}.".format(op))
 
@@ -217,18 +429,18 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0):
         op_type = 'c_allreduce_min'
     elif op == ReduceOp.PROD:
         op_type = 'c_allreduce_prod'
-    if not isinstance(group, int):
-        raise ValueError("The type of 'group' for all_reduce should be int.")
+    if not isinstance(ring_id, int):
+        raise ValueError("The type of 'ring_id' for all_reduce should be int.")
     helper = LayerHelper(op_type, **locals())
     helper.append_op(
         type=op_type,
         inputs={'X': [tensor]},
         outputs={'Out': [tensor]},
-        attrs={'ring_id': group,
-               'use_calc_stream': True})
+        attrs={'ring_id': ring_id,
+               'use_calc_stream': use_calc_stream})
 
 
-def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
+def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True):
     """
 
     Reduce a tensor to the destination from all others.
@@ -237,8 +449,10 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
         tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type
             should be float16, float32, float64, int32 or int64.
         dst (int): The destination rank id.
-        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used.
-        group (int): The id of the process group to work on.
+        op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default value is ReduceOp.SUM.
+        group (Group): The group instance returned by new_group or None for the global default group.
+        use_calc_stream (bool): Whether to use calculation stream (True) or communication stream (False).
+            Defaults to True.
 
     Returns:
         None.
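For illustration only (assuming at least two trainers; the values are arbitrary), a minimal reduce call against the global default group:

    import paddle

    paddle.distributed.init_parallel_env()
    t = paddle.to_tensor([1, 2, 3])
    # after this call only the dst rank holds the summed result; note that
    # dst is always given as a global rank, even when a group is passed
    paddle.distributed.reduce(t, dst=0)
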
@@ -261,20 +475,33 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
             out = data.numpy()
             # [[5, 7, 9], [5, 7, 9]]
     """
+    if group is not None and not group.is_member():
+        return
+
+    if not isinstance(dst, int):
+        raise ValueError("dst should be int.")
+
+    ring_id = 0 if group is None else group.id
+    gdst = dst if group is None else group.get_group_rank(dst)
+    assert gdst >= 0, ("dst rank out of group, need global rank")
+
     if in_dygraph_mode():
         if op == ReduceOp.SUM:
             return core.ops.c_reduce_sum(tensor, tensor, 'use_calc_stream',
-                                         True, 'ring_id', group, 'root_id', dst)
+                                         use_calc_stream, 'ring_id', ring_id,
+                                         'root_id', gdst)
         elif op == ReduceOp.MAX:
             return core.ops.c_reduce_max(tensor, tensor, 'use_calc_stream',
-                                         True, 'ring_id', group, 'root_id', dst)
+                                         use_calc_stream, 'ring_id', ring_id,
+                                         'root_id', gdst)
         elif op == ReduceOp.MIN:
             return core.ops.c_reduce_min(tensor, tensor, 'use_calc_stream',
-                                         True, 'ring_id', group, 'root_id', dst)
+                                         use_calc_stream, 'ring_id', ring_id,
+                                         'root_id', gdst)
         elif op == ReduceOp.PROD:
             return core.ops.c_reduce_prod(tensor, tensor, 'use_calc_stream',
-                                          True, 'ring_id', group, 'root_id',
-                                          dst)
+                                          use_calc_stream, 'ring_id', ring_id,
+                                          'root_id', gdst)
         else:
             raise ValueError("Unknown parameter: {}.".format(op))
 
@@ -295,22 +522,19 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0):
     elif op == ReduceOp.PROD:
         op_type = 'c_reduce_prod'
 
-    if not isinstance(dst, int) or not isinstance(group, int):
-        raise ValueError("Both the type of 'dst' and 'group' for reduce "
-                         "should be int.")
     helper = LayerHelper(op_type, **locals())
     helper.append_op(
         type=op_type,
         inputs={'X': [tensor]},
         outputs={'Out': [tensor]},
         attrs={
-            'ring_id': group,
-            'use_calc_stream': True,
-            'root_id': dst,
+            'ring_id': ring_id,
+            'use_calc_stream': use_calc_stream,
+            'root_id': gdst,
         })
 
 
-def all_gather(tensor_list, tensor, group=0):
+def all_gather(tensor_list, tensor, group=None, use_calc_stream=True):
     """
 
     Gather tensors from all participators and all get the result.
@@ -320,7 +544,9 @@ def all_gather(tensor_list, tensor, group=0):
             should be float16, float32, float64, int32 or int64.
         tensor (Tensor): The Tensor to send. Its data type
             should be float16, float32, float64, int32 or int64.
-        group (int): The id of the process group to work on.
+        group (Group): The group instance returned by new_group or None for the global default group.
+        use_calc_stream (bool): Whether to use calculation stream (True) or communication stream (False).
+            Defaults to True.
 
     Returns:
         None.
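A minimal all_gather sketch, hedged: assumes a multi-trainer launch, and the per-rank payload here is just the rank id for readability:

    import paddle

    paddle.distributed.init_parallel_env()
    tensor_list = []
    t = paddle.to_tensor([paddle.distributed.get_rank()])
    paddle.distributed.all_gather(tensor_list, t)
    # tensor_list now holds one tensor per rank, ordered by rank
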
@@ -348,13 +574,19 @@ def all_gather(tensor_list, tensor, group=0):
             data2 = paddle.to_tensor(np_data2)
             paddle.distributed.all_gather(tensor_list, data2)
     """
+    if group is not None and not group.is_member():
+        return
+
+    ring_id = 0 if group is None else group.id
+    nranks = _get_global_group().nranks if group is None else group.nranks
+
     op_type = 'c_allgather'
     helper = LayerHelper(op_type, **locals())
     out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
-    _default_group = _get_global_default_group()
+
     if in_dygraph_mode():
-        core.ops.c_allgather(tensor, out, 'use_calc_stream', True, 'ring_id',
-                             group, 'nranks', _default_group.nranks)
+        core.ops.c_allgather(tensor, out, 'use_calc_stream', use_calc_stream,
+                             'ring_id', ring_id, 'nranks', nranks)
     else:
         if not isinstance(tensor_list, list):
             raise ValueError("The type of 'tensor_list' for all_gather "
@@ -367,23 +599,20 @@ def all_gather(tensor_list, tensor, group=0):
         check_variable_and_dtype(
             tensor, 'tensor',
             ['float16', 'float32', 'float64', 'int32', 'int64'], 'all_gather')
-        if not isinstance(group, int):
-            raise ValueError("The type of 'group' for all_gather "
-                             "should be int.")
         helper.append_op(
             type=op_type,
             inputs={'X': [tensor]},
             outputs={'Out': [out]},
             attrs={
-                'ring_id': group,
-                'use_calc_stream': True,
-                'nranks': _default_group.nranks
+                'ring_id': ring_id,
+                'use_calc_stream': use_calc_stream,
+                'nranks': nranks
             })
 
-    tensor_list.extend(paddle.split(out, _default_group.nranks, 0))
+    tensor_list.extend(paddle.split(out, nranks, 0))
 
 
-def scatter(tensor, tensor_list=None, src=0, group=0):
+def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True):
     """
 
     Scatter a tensor to all participators.
@@ -392,9 +621,11 @@ def scatter(tensor, tensor_list=None, src=0, group=0):
         tensor (Tensor): The output Tensor. Its data type
             should be float16, float32, float64, int32 or int64.
         tensor_list (list): A list of Tensors to scatter. Every element in the list must be a Tensor whose data type
-            should be float16, float32, float64, int32 or int64.
-        src (int): The source rank id.
-        group (int): The id of the process group to work on.
+            should be float16, float32, float64, int32 or int64. Default value is None.
+        src (int): The source rank id. Default value is 0.
+        group (Group): The group instance returned by new_group or None for the global default group.
+        use_calc_stream (bool): Whether to use calculation stream (True) or communication stream (False).
+            Defaults to True.
 
     Returns:
         None.
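A hedged scatter sketch, assuming exactly two trainers so that tensor_list has one chunk per rank; the shapes and values are illustrative:

    import paddle

    paddle.distributed.init_parallel_env()
    out = paddle.zeros([2])
    if paddle.distributed.get_rank() == 0:
        chunks = [paddle.ones([2]), paddle.ones([2]) * 2]
        paddle.distributed.scatter(out, tensor_list=chunks, src=0)
    else:
        # non-src ranks may pass tensor_list=None; the implementation above
        # rebuilds a placeholder list from the output tensor internally
        paddle.distributed.scatter(out, tensor_list=None, src=0)
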
@@ -422,45 +653,52 @@ def scatter(tensor, tensor_list=None, src=0, group=0):
             paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1)
             out = data1.numpy()
     """
+    if group is not None and not group.is_member():
+        return
+
+    if not isinstance(src, int):
+        raise ValueError("src should be int.")
+
+    ring_id = 0 if group is None else group.id
+    gsrc = src if group is None else group.get_group_rank(src)
+    assert gsrc >= 0, ("src rank out of group, need global rank")
+    rank = _get_global_group().rank if group is None else group.rank
+    nranks = _get_global_group().nranks if group is None else group.nranks
+
     op_type = 'c_scatter'
-    _default_group = _get_global_default_group()
-    rank = _default_group.rank
-    nranks = _default_group.nranks
-    if rank != src:
+
+    if rank != gsrc:
         tensor_list = []
         for _ in range(nranks):
             tensor_list.append(tensor)
     temp = paddle.concat(tensor_list, axis=0)
     if in_dygraph_mode():
-        return core.ops.c_scatter(temp, tensor, 'use_calc_stream', True,
-                                  'ring_id', group, 'nranks',
-                                  _default_group.nranks, 'root', src)
+        return core.ops.c_scatter(temp, tensor, 'use_calc_stream',
+                                  use_calc_stream, 'ring_id', ring_id, 'nranks',
+                                  nranks, 'root', gsrc)
     check_variable_and_dtype(
         tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'],
         'scatter')
-    if not isinstance(group, int) or not isinstance(src, int):
-        raise ValueError("Both the type of 'src' and 'group' for scatter "
-                         "should be int.")
+
     helper = LayerHelper(op_type, **locals())
     helper.append_op(
         type=op_type,
         inputs={'X': [temp]},
         outputs={'Out': [tensor]},
         attrs={
-            'ring_id': group,
-            'root': src,
-            'use_calc_stream': True,
+            'ring_id': ring_id,
+            'root': gsrc,
+            'use_calc_stream': use_calc_stream,
             'nranks': nranks,
         })
 
 
-def barrier(group=0):
+def barrier(group=None):
     """
 
     Barrier among all participators in the group.
 
     Args:
-        group (int): The id of the process group to work on.
+        group (Group): The group instance returned by new_group or None for the global default group.
 
     Returns:
         None.
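And the simplest of the collectives, as a sketch (again assuming a multi-trainer launch):

    import paddle

    paddle.distributed.init_parallel_env()
    # ... rank-local work ...
    paddle.distributed.barrier()  # no rank proceeds until all arrive here
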
@@ -475,18 +713,23 @@ def barrier(group=0): init_parallel_env() paddle.distributed.barrier() """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + op_type = 'barrier' temp = fill_constant([1], dtype="int32", value="1") if in_dygraph_mode(): - return core.ops.barrier(temp, temp, 'ring_id', group) - if not isinstance(group, int): + return core.ops.barrier(temp, temp, 'ring_id', ring_id) + if not isinstance(ring_id, int): raise ValueError("The type of 'group' for barrier must be int.") helper = LayerHelper(op_type, **locals()) helper.append_op( type=op_type, inputs={'X': [temp]}, outputs={'Out': [temp]}, - attrs={'ring_id': group}) + attrs={'ring_id': ring_id}) def _parallel_linear(x, num_rows, num_cols, axis, param_attr, bias_attr, @@ -515,10 +758,10 @@ def _parallel_linear(x, num_rows, num_cols, axis, param_attr, bias_attr, if gather_out: if axis == 0: - paddle.distributed.all_reduce(linear_out, group=0) + paddle.distributed.all_reduce(linear_out) else: output = [] - paddle.distributed.all_gather(output, linear_out, group=0) + paddle.distributed.all_gather(output, linear_out) linear_out = paddle.concat(output, axis=len(linear_out.shape) - 1) return linear_out @@ -559,7 +802,7 @@ def _parallel_embedding(x, per_part_embeddings, origin_size, param_attr, main_block = paddle.static.default_main_program().global_block() startup_block.vars[embedding.weight.name].is_distributed = True main_block.vars[embedding.weight.name].is_distributed = True - paddle.distributed.all_reduce(emb_out, group=0) + paddle.distributed.all_reduce(emb_out, group=None) return emb_out @@ -584,7 +827,7 @@ def split(x, With parallel embedding, the weight is split into num_partitions partitions, each of which is a matrix with (N/num_partitions + 1) rows and M column where the last row as the padding idx. - + Suppose we split the NxM weight into two partitons on device_0 and device_1 respectively. Then, one each device, the final weight has (N/2 + 1) rows with the index range from 0 to N/2. On device_0, all values in the input within [0, N/2 -1] diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index bd8492ecfa7ee7..6d4aedddba6747 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -20,16 +20,13 @@ from .dataset import * from .data_generator import MultiSlotDataGenerator, MultiSlotStringDataGenerator from . 
import metrics
+from .base.topology import CommunicateTopology, HybridCommunicateGroup
 
 __all__ = [
-    "DistributedStrategy",
-    "UtilBase",
-    "UserDefinedRoleMaker",
-    "PaddleCloudRoleMaker",
-    "Fleet",
-    "MultiSlotDataGenerator",
-    "MultiSlotStringDataGenerator",
-    "Role",
+    "DistributedStrategy", "UtilBase", "UserDefinedRoleMaker",
+    "PaddleCloudRoleMaker", "Fleet", "MultiSlotDataGenerator",
+    "MultiSlotStringDataGenerator", "Role", "CommunicateTopology",
+    "HybridCommunicateGroup"
 ]
 
 fleet = Fleet()
@@ -40,6 +37,17 @@
 is_first_worker = fleet.is_first_worker
 worker_index = fleet.worker_index
 worker_num = fleet.worker_num
+node_num = fleet.node_num
+rank = fleet.worker_index
+nranks = fleet.worker_num
+world_size = fleet.worker_num
+# device id in current trainer
+local_device_ids = fleet.local_device_ids
+# device ids in world
+world_device_ids = fleet.world_device_ids
+# rank in node
+local_rank = fleet.local_rank
+rank_in_node = local_rank
 is_worker = fleet.is_worker
 worker_endpoints = fleet.worker_endpoints
 server_num = fleet.server_num
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index f79013d7347c00..626f6a37a982e0 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -620,6 +620,34 @@ def last_comm_group_size_MB(self, value):
         else:
             raise ValueError("last_comm_group_size_MB should be greater than 0")
 
+    @property
+    def find_unused_parameters(self):
+        """
+        Indicating whether to find the parameters that are unused in the
+        backward pass of DataParallel.
+
+        Default value: True
+
+        Examples:
+
+          .. code-block:: python
+
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.find_unused_parameters = True
+        """
+
+        return self.strategy.find_unused_parameters
+
+    @find_unused_parameters.setter
+    @is_strict_auto
+    def find_unused_parameters(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.find_unused_parameters = flag
+        else:
+            print(
+                "WARNING: find_unused_parameters should have a value of bool type")
+
     @property
     def _fuse_grad_size_in_TFLOPS(self):
         return self.strategy.fuse_grad_size_in_TFLOPS
diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py
index 19ba637cc96809..0a60cbf78d5236 100644
--- a/python/paddle/distributed/fleet/base/fleet_base.py
+++ b/python/paddle/distributed/fleet/base/fleet_base.py
@@ -289,6 +289,18 @@ def worker_num(self):
         """
         return self._role_maker._worker_num()
 
+    def node_num(self):
+        return self._role_maker._get_node_num()
+
+    def local_rank(self):
+        return self._role_maker._get_local_rank()
+
+    def local_device_ids(self):
+        return self._role_maker._get_local_device_ids()
+
+    def world_device_ids(self):
+        return self._role_maker._get_world_device_ids()
+
     def is_worker(self):
         """
         Check whether the node is an instance of worker.
@@ -628,12 +640,13 @@ def distributed_optimizer(self, optimizer, strategy=None):
         self.user_defined_optimizer = optimizer
 
         if strategy is not None:
-            warnings.warn(
-                "It is recommended to use DistributedStrategy "
-                "in fleet.init(). The strategy here is only for compatibility. 
" - "If the strategy in fleet.distributed_optimizer() is " - "not None, then it will overwrite the DistributedStrategy in fleet.init(), " - "which will take effect in distributed training.") + if self._is_collective: + warnings.warn( + "It is recommended to use DistributedStrategy " + "in fleet.init(). The strategy here is only for compatibility. " + "If the strategy in fleet.distributed_optimizer() is " + "not None, then it will overwrite the DistributedStrategy in fleet.init(), " + "which will take effect in distributed training.") self._user_defined_strategy = copy.deepcopy(strategy) self._context = {} @@ -705,7 +718,9 @@ def forward(self, x): model, comm_buffer_size=self._user_defined_strategy.fuse_grad_size_in_MB, last_comm_buffer_size=self._user_defined_strategy. - last_comm_group_size_MB) + last_comm_group_size_MB, + find_unused_parameters=self._user_defined_strategy. + find_unused_parameters) return self.model @dygraph_only diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index a8683aea97fff4..62c8faa0757c66 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -622,6 +622,29 @@ def _node_num(self): self._generate_role() return self._nodes_num + def _get_node_num(self): + """ + return the training node number + """ + if not self._role_is_generated: + self._generate_role() + return self._nodes_num + + def _get_local_rank(self): + if not self._role_is_generated: + self._generate_role() + return self._local_rank + + def _get_local_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._local_device_ids + + def _get_world_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._world_device_ids + def _get_trainer_endpoints(self): """ get endpoint of all trainers @@ -782,6 +805,9 @@ def _collective_env(self): self._trainers_num = len(self._worker_endpoints) self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) + self._local_rank = os.getenv("PADDLE_RANK_IN_NODE") + self._local_device_ids = os.getenv("PADDLE_LOCAL_DEVICE_IDS") + self._world_device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py new file mode 100644 index 00000000000000..4e20ad50611399 --- /dev/null +++ b/python/paddle/distributed/fleet/base/topology.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle
+import collections
+import numpy as np
+from itertools import product
+from functools import reduce
+__all__ = ['CommunicateTopology', 'HybridCommunicateGroup']
+
+
+class CommunicateTopology(object):
+    def __init__(self, hybrid_group_names, dims):
+        self._parallel_names = hybrid_group_names
+        self._dims = dims
+        self.coordinate = collections.namedtuple('Coordinate',
+                                                 self._parallel_names)
+        self._world_size = reduce(lambda x, y: x * y, self._dims)
+
+        ranges = [range(d) for d in self._dims]
+        all_coordinate = [self.coordinate(*x) for x in product(*ranges)]
+
+        self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate))))
+        self._rank2coord = dict(
+            zip(self._coord2rank.values(), self._coord2rank.keys()))
+
+    def get_hybrid_group_names(self):
+        return self._parallel_names
+
+    def get_dim(self, axis_name):
+        return self._dims[self._parallel_names.index(axis_name)]
+
+    def world_size(self):
+        return self._world_size
+
+    def get_rank(self, **args):
+        assert len(args) == len(self._dims)
+        key = self.coordinate(**args)
+        assert key in self._coord2rank.keys()
+        return self._coord2rank[key]
+
+    def get_coord(self, rank):
+        assert rank < self._world_size
+        assert rank in self._rank2coord.keys()
+        return self._rank2coord[rank]
+
+    def get_axis_list(self, axis_name, index):
+        axis = self._parallel_names.index(axis_name)
+        ranks = [
+            self._coord2rank[coord] for coord in self._coord2rank.keys()
+            if coord[axis] == index
+        ]
+        ranks.sort()
+        return ranks
+
+    def get_dim_size(self, axis_name):
+        assert axis_name in self._parallel_names
+        return self._dims[self._parallel_names.index(axis_name)]
+
+    def get_comm_list(self, axis_name):
+        assert axis_name in self._parallel_names
+        other_axis_names = [
+            name for name in self._parallel_names if name != axis_name
+        ]
+
+        ranges = []
+        for name in other_axis_names:
+            dim_num = self.get_dim_size(name)
+            ranges.append(range(dim_num))
+
+        all_result = []
+        for x in product(*ranges):
+            key_coord = {}
+            for other_name in other_axis_names:
+                key_coord[other_name] = x[other_axis_names.index(other_name)]
+
+            result = []
+            for i in range(0, self.get_dim_size(axis_name)):
+                key_coord[axis_name] = i
+                result.append(self._coord2rank[self.coordinate(**key_coord)])
+            all_result.append(result)
+
+        return all_result
+
+
+class HybridCommunicateGroup(object):
+    def __init__(self, topology):
+        self.nranks = paddle.distributed.get_world_size()
+        self.global_rank = paddle.distributed.get_rank()
+        self._topo = topology
+
+        self._num_data_parallel = self._topo.get_dim('data')
+        self._num_model_parallel = self._topo.get_dim('model')
+        self._num_pipe_parallel = self._topo.get_dim('pipe')
+
+        self._data_parallel_id = self._get_data_parallel_id()
+        self._model_parallel_id = self._get_model_parallel_id()
+
+        assert self._check_valid_topo(
+        ), "Invalid topology setting: the product of the parallel degrees must equal the world size"
+
+        # create comm group for data parallel
+        self._dp_group, self._dp_comm_group = self._set_comm_group("data")
+        print("data parallel group", self._dp_group)
+
+        # create comm group for model parallel
+        self._mp_group, self._mp_comm_group = self._set_comm_group("model")
+        print("model parallel group", self._mp_group)
+
+    def _check_valid_topo(self):
+        return self._num_data_parallel * self._num_model_parallel * self._num_pipe_parallel == self.nranks
+
+    def _set_comm_group(self, parallel_method="data"):
+        parallel_group = []
+        parallel_comm_group = None
+        parallel_groups = self._topo.get_comm_list(parallel_method)
+
+        for group in parallel_groups:
+            comm_group = paddle.distributed.new_group(ranks=group)
+            if self.global_rank in group:
+                parallel_group = group
+                parallel_comm_group = comm_group
+
+        assert len(parallel_group) > 0
+        assert parallel_comm_group is not None
+
+        return parallel_group, parallel_comm_group
+
+    def topology(self):
+        return self._topo
+
+    def get_global_rank(self):
+        return self.global_rank
+
+    # data parallel message:
+    def _get_data_parallel_id(self):
+        return self._topo.get_coord(self.global_rank).data
+
+    def get_data_parallel_rank(self):
+        return self._data_parallel_id
+
+    def get_data_parallel_world_size(self):
+        return self._num_data_parallel
+
+    def get_data_parallel_group(self):
+        return self._dp_comm_group
+
+    def get_data_parallel_group_src_rank(self):
+        return self._dp_comm_group.ranks[0]
+
+    # model parallel message:
+    def _get_model_parallel_id(self):
+        return self._topo.get_coord(self.global_rank).model
+
+    def get_model_parallel_rank(self):
+        return self._model_parallel_id
+
+    def get_model_parallel_world_size(self):
+        return self._num_model_parallel
+
+    def get_model_parallel_group(self):
+        return self._mp_comm_group
+
+    def get_model_parallel_group_src_rank(self):
+        return self._mp_comm_group.ranks[0]
diff --git a/python/paddle/distributed/fleet/data_generator/test_data_generator.py b/python/paddle/distributed/fleet/data_generator/test_data_generator.py
deleted file mode 100644
index 60cbaf0bd36435..00000000000000
--- a/python/paddle/distributed/fleet/data_generator/test_data_generator.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-import paddle
-import paddle.distributed.fleet as fleet
-
-
-class SyntheticData(fleet.MultiSlotDataGenerator):
-    def generate_sample(self, line):
-        def data_iter():
-            for i in range(10000):
-                yield ("words", [1, 2, 3, 4]), ("label", [0])
-
-        return data_iter
-
-
-class SyntheticStringData(fleet.MultiSlotStringDataGenerator):
-    def generate_sample(self, line):
-        def data_iter():
-            for i in range(10000):
-                yield [("words", ["1", "2", "3", "4"]), ("label", ["0"])]
-
-        return data_iter
-
-
-sd = SyntheticData()
-sd.run_from_memory()
-
-sd2 = SyntheticStringData()
-sd2.run_from_memory()
diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py
index 0f9b13d8a1271f..d6f4227a92380a 100644
--- a/python/paddle/distributed/fleet/launch.py
+++ b/python/paddle/distributed/fleet/launch.py
@@ -108,6 +108,21 @@ def _parse_args():
         "In gpu training, it should be less or equal to the gpus number of you system(or you set by --gpus). And so each process can"
         " bound to one or average number of gpus.")
 
+    base_group.add_argument(
+        "--run_mode",
+        type=str,
+        default="collective",
+        help="run mode of the job, can be: collective/ps/ps-heter")
+
+    base_group.add_argument(
+        "--ascend_npus",
+        type=str,
+        default=None,
+        help="It's for ascend npu training."
+        "For example:"
+        "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu."
+    )
+
     if fluid.core.is_compiled_with_cuda():
         base_group.add_argument(
             "--gpus",
@@ -243,6 +258,9 @@ def launch_collective(args):
         log_dir=args.log_dir,
         envs=global_envs)
 
+    for idx, proc in enumerate(procs):
+        print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx))
+
     while True:
         alive = watch_local_trainers(procs, cluster.trainers_nranks())
 
@@ -276,6 +294,16 @@ def launch_ps(args, distribute_mode):
 
 
 def which_distributed_mode(args):
+    if args.run_mode is not None:
+        assert args.run_mode in ["collective", "ps", "ps-heter"]
+
+    if args.run_mode == "collective":
+        return DistributeMode.COLLECTIVE
+    elif args.run_mode == "ps":
+        return DistributeMode.PS
+    elif args.run_mode == "ps-heter":
+        return DistributeMode.PS_HETER
+
     ps_args = [
         '--worker_num', '--server_num', '--heter_worker_num', '--servers',
         '--workers', '--heter_workers', '--http_port'
@@ -298,24 +326,26 @@ def which_distributed_mode(args):
     )
 
     if fluid.core.is_compiled_with_cuda():
-        device_count = fluid.core.get_cuda_device_count()
+        accelerators = fluid.core.get_cuda_device_count()
+    elif fluid.core.is_compiled_with_ascend():
+        accelerators = fluid.core.NPUDevice.get_device_count()
     elif fluid.core.is_compiled_with_xpu():
-        device_count = fluid.core.get_xpu_device_count()
+        accelerators = fluid.core.get_xpu_device_count()
     else:
-        device_count = 0
+        accelerators = 0
 
     if len(has_ps_args) > 0:
         logger.info(
-            "Run parameter-sever mode. pserver arguments:{}, cuda or xpu count:{}".
-            format(has_ps_args, device_count))
+            "Run parameter-server mode. pserver arguments:{}, accelerators count:{}".
+            format(has_ps_args, accelerators))
         has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args))
         if len(has_ps_heter_args) > 0:
             return DistributeMode.PS_HETER
         else:
             return DistributeMode.PS
     elif len(has_collective_args) > 0:
-        logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}".
-                    format(has_collective_args, device_count))
+        logger.info("Run collective mode. gpu arguments:{}, accelerators count:{}".
+                    format(has_collective_args, accelerators))
        return DistributeMode.COLLECTIVE
     else:
         if not fluid.core.is_compiled_with_cuda(
diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py
index c5cb1ec94ac3d0..2d2807bce28156 100644
--- a/python/paddle/distributed/fleet/launch_utils.py
+++ b/python/paddle/distributed/fleet/launch_utils.py
@@ -52,6 +52,8 @@ class DeviceMode():
     GPU = 1
     KUNLUN = 2
     XPU = 2
+    ASCEND_NPU = 3
+    UNKNOWN = 3
 
 
 class Cluster(object):
@@ -98,6 +100,14 @@ def trainers_endpoints(self):
                 r.append(t.endpoint)
         return r
 
+    def world_device_ids(self):
+        r = []
+        for pod in self.pods:
+            for t in pod.trainers:
+                str_accelerators = [str(acc) for acc in t.accelerators]
+                r.append(str_accelerators)
+        return r
+
     def pods_endpoints(self):
         r = []
         for pod in self.pods:
@@ -105,7 +115,6 @@ def pods_endpoints(self):
             assert pod.port != None and pod.addr != None, "{} not a valid endpoint".format(
                 ep)
             r.append(ep)
-
         return r
 
     def get_pod_by_id(self, pod_id):
@@ -132,23 +141,23 @@ def __ne__(self, j):
 
 class Trainer(object):
     def __init__(self):
-        self.gpus = []
+        self.accelerators = []
         self.endpoint = None
         self.rank = None
 
     def __str__(self):
-        return "gpu:{} endpoint:{} rank:{}".format(self.gpus, self.endpoint,
-                                                   self.rank)
+        return "accelerator:{} endpoint:{} rank:{}".format(
+            self.accelerators, self.endpoint, self.rank)
 
     def __eq__(self, t):
-        if len(self.gpus) != len(t.gpus):
+        if len(self.accelerators) != len(t.accelerators):
             return False
 
         if self.endpoint != t.endpoint or \
                 self.rank != t.rank:
             return False
 
-        for a, b in zip(self.gpus, t.gpus):
+        for a, b in zip(self.accelerators, t.accelerators):
             if a != b:
                 return False
 
@@ -171,12 +180,13 @@ def __init__(self):
         self.servers = []
         self.workers = []
         self.heter_workers = []
-        self.gpus = []
+        self.accelerators = []
+        self.device_mode = None
 
     def __str__(self):
-        return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \
+        return "rank:{} id:{} addr:{} port:{} visible_accelerator:{} trainers:{} servers:{} \
 workers:{} heter_workers:{}".format(
-            self.rank, self.id, self.addr, self.port, self.gpus, [
+            self.rank, self.id, self.addr, self.port, self.accelerators, [
                 str(t) for t in self.trainers
             ], [str(s) for s in self.servers], [str(w) for w in self.workers],
             [str(h) for h in self.heter_workers])
@@ -231,12 +241,12 @@ def parse_response(self, res_pods):
     def rank(self):
         return self.rank
 
-    def get_visible_gpus(self):
+    def get_visible_accelerators(self):
         r = ""
-        for g in self.gpus:
+        for g in self.accelerators:
             r += "{},".format(g)
 
-        assert r != "", "this pod {} can't see any gpus".format(self)
+        assert r != "", "this pod {} can't see any accelerators".format(self)
 
         r = r[:-1]
         return r
@@ -264,23 +274,27 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
     pod = Pod()
     pod.rank = node_rank
     pod.addr = ip
+    pod.device_mode = device_mode
+
     cur_node_endpoints = trainer_endpoints[node_rank]
     # when use paddlecloud, endpoints may > devices_per_proc(user_defined)
     assert len(cur_node_endpoints) >= len(
         devices_per_proc
-    ), "current trainer_endpoints size should be greater equal than selected_gpus size."
+    ), "current trainer_endpoints size should be greater than or equal to accelerators size."
    for i in range(len(devices_per_proc)):
         trainer = Trainer()
-        if device_mode == DeviceMode.GPU:
+        if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU:
             if isinstance(devices_per_proc[i], (list, tuple)):
-                trainer.gpus.extend(devices_per_proc[i])
+                trainer.accelerators.extend(devices_per_proc[i])
+                pod.accelerators.extend(devices_per_proc[i])
             else:
-                trainer.gpus.append(devices_per_proc[i])
+                trainer.accelerators.append(devices_per_proc[i])
+                pod.accelerators.append(devices_per_proc[i])
         elif device_mode == DeviceMode.XPU:
             if isinstance(devices_per_proc[i], (list, tuple)):
-                trainer.gpus.extend(devices_per_proc[i])
+                trainer.accelerators.extend(devices_per_proc[i])
             else:
-                trainer.gpus.append(devices_per_proc[i])
+                trainer.accelerators.append(devices_per_proc[i])
         trainer.endpoint = "%s" % (cur_node_endpoints[i])
         trainer.rank = trainer_rank
         trainer_rank += 1
@@ -451,21 +465,32 @@ def start_local_trainers(cluster,
     current_env.pop("http_proxy", None)
     current_env.pop("https_proxy", None)
 
+    ids = cluster.world_device_ids()
+    res = [':'.join(ele) for ele in ids]
     procs = []
     for idx, t in enumerate(pod.trainers):
         proc_env = {
             "PADDLE_TRAINER_ID": "%d" % t.rank,
             "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
             "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
-            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
+            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
+            "PADDLE_RANK_IN_NODE": str(idx),
+            "PADDLE_LOCAL_DEVICE_IDS":
+            ",".join([str(acc) for acc in t.accelerators]),
+            "PADDLE_WORLD_DEVICE_IDS": ",".join(res),
         }
 
-        if fluid.core.is_compiled_with_cuda() and len(t.gpus) > 0:
+        if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.GPU:
             proc_env["FLAGS_selected_gpus"] = "%s" % ",".join(
-                [str(g) for g in t.gpus])
-        elif fluid.core.is_compiled_with_xpu() and len(t.gpus) > 0:
+                [str(g) for g in t.accelerators])
+
+        if len(t.accelerators) > 0:
+            proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join(
+                [str(g) for g in t.accelerators])
+        # TODO: unify the code style here in the future
+        if fluid.core.is_compiled_with_xpu() and len(t.accelerators) > 0:
             proc_env["FLAGS_selected_xpus"] = "%s" % ",".join(
-                [str(g) for g in t.gpus])
+                [str(g) for g in t.accelerators])
 
         current_env.update(proc_env)
 
@@ -564,6 +589,17 @@ def watch_local_trainers(procs, nranks):
     return alive
 
 
+def get_ascend_npus(npus):
+    if npus is None:
+        count = fluid.core.NPUDevice.get_device_count()
+        if count <= 0:
+            return None
+        ret = [x for x in range(count)]
+    else:
+        ret = [x.strip() for x in npus.split(',')]
+    return ret
+
+
 def get_gpus(gpus):
     if gpus is None:
         gpus_num = fluid.core.get_cuda_device_count()
@@ -623,11 +659,17 @@ def get_xpus(xpus):
 
 
 def get_device_mode():
-    if fluid.core.is_compiled_with_cuda() and fluid.core.get_cuda_device_count(
-    ) > 0:
-        print("launch train in GPU mode")
+    if fluid.core.is_compiled_with_ascend() and \
+            fluid.core.NPUDevice.get_device_count() > 0:
+        print("launch train in ascend npu mode!")
+        return DeviceMode.ASCEND_NPU
+
+    if fluid.core.is_compiled_with_cuda() and \
+            fluid.core.get_cuda_device_count() > 0:
+        print("launch train in GPU mode!")
         return DeviceMode.GPU
-    elif fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count(
+
+    if fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count(
     ) > 0:
         print("launch train in XPU mode")
         return DeviceMode.XPU
@@ -654,6 +696,10 @@ def get_device_proc_info(args):
             ]
         else:
             devices_per_proc = gpus
+    elif device_mode == DeviceMode.ASCEND_NPU:
+        npus = get_ascend_npus(args.ascend_npus)
+        assert args.nproc_per_node is None, "ascend_npus doesn't need the nproc_per_node argument"
+        devices_per_proc = npus
     elif device_mode == DeviceMode.XPU:
         xpus = get_xpus(args.xpus)
         if args.nproc_per_node is not None:
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
old mode 100644
new mode 100755
index dba3c944f70ab8..02505e01197dc6
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -59,6 +59,7 @@ def _init_wrapped_opt(self):
         is_distributed = self.role_maker._worker_num() > 1
         if self.user_defined_strategy.sharding:
             # FIXME(wangxi). sharding failed when split check_finite_and_unscale
+            # FIXME(JZ-LIANG). To support Sharding-Megatron-AMP, Megatron should follow Sharding's behavior that to disable is_distributed.
             is_distributed = False
         self.wrapped_opt._set_distributed(is_distributed)
 
diff --git a/paddle/fluid/train/demo/clean.sh b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py
old mode 100755
new mode 100644
similarity index 81%
rename from paddle/fluid/train/demo/clean.sh
rename to python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py
index a2064492c08b84..b9a7651e449096
--- a/paddle/fluid/train/demo/clean.sh
+++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py
@@ -1,6 +1,4 @@
-#!/bin/bash
-
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,8 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-set -x
-cd "$(dirname "$0")"
-rm -rf build/
-set +x
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
index d7ac81bb5c584a..978899604eaf8c 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py
@@ -12,16 +12,26 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 import paddle.fluid.framework as framework
 from paddle.fluid.optimizer import Optimizer
 import paddle.fluid.core as core
 import numpy as np
-import ascend_parser
+from . import ascend_parser
+from paddle.distributed import fleet
+import hccl.manage.api as hccl
+from collections import namedtuple
+
+HcomGroupConfig = namedtuple('HcomGroupConfig', ['name', 'nranks', 'rank_ids'])
 
 
 class AscendIRParser(object):
-    def __init__(self):
+    def __init__(self, auto_dp=False, world_rank_size=1):
         self.graph_idx = 0
+        self.hcom_endpoints = {}
+        self.groups_to_create = []
+        self._auto_dp = auto_dp
+        self._world_rank_size = world_rank_size
 
     def _construct_input_map(self, input_varlist):
         ret_map = {}
@@ -43,15 +53,52 @@ def _construct_input_map(self, input_varlist):
             ret_map[var.name] = ge_input
         return ge_in_operator, ret_map
 
+    def _endpoint_to_world_rank_id(self, endpoint):
+        world_endpoints = fleet.worker_endpoints()
+        assert endpoint in world_endpoints, "endpoint (%s) not in worker_endpoints (%s) " % (
+            endpoint, world_endpoints)
+        return world_endpoints.index(endpoint)
+
     def parse_op(self, op):
-        if op.type in ascend_parser.registerd_op:
-            print("Op[%s] has been registered, begin to parse it" % (op.type))
+        if op.type == 'c_gen_nccl_id':
+            endpoint = op.attr("endpoint")
+            other_endpoints = op.attr("other_endpoints")
+            rank = op.attr("rank")
+
+            nccl_id = op.output_arg_names[0]
+
+            # c_gen_nccl_id operator splits endpoints into local endpoint and other_endpoints
+            # we should combine these together to produce world_rank_ids
+            self.hcom_endpoints[nccl_id] = other_endpoints[:]
+            self.hcom_endpoints[nccl_id].insert(rank, endpoint)
+
+            print("nccl_id (%s) registered endpoints %s" %
+                  (nccl_id, self.hcom_endpoints[nccl_id]))
+        elif op.type == 'c_comm_init':
+            nccl_id = op.input_arg_names[0]
+            nranks = op.attr("nranks")
+            assert nranks == len(self.hcom_endpoints[
+                nccl_id]), "nranks doesn't match endpoint count"
+            rank = op.attr("rank")
+            ring_id = op.attr("ring_id")
+
+            group_name = "hcom_group_" + str(ring_id)
+            global_rank_ids = [
+                self._endpoint_to_world_rank_id(endpoint)
+                for endpoint in self.hcom_endpoints[nccl_id]
+            ]
+            self.groups_to_create.append(
+                HcomGroupConfig(
+                    name=group_name, nranks=nranks, rank_ids=global_rank_ids))
+            print("append to create group: %s, with rank_ids: %s" %
+                  (group_name, global_rank_ids))
+        elif op.type in ascend_parser.registerd_op:
             op_parser = self.parser_factory.create_parse(
                 ascend_parser.registerd_op[op.type])
             op_parser.apply(op)
         else:
-            print("Op[%s] has not been registered, so we have to skip it" %
-                  (op.type))
+            assert False, "Op[%s] has not been registered" % (op.type)
 
     def _parse_program(self,
                        graph_name,
@@ -84,7 +131,7 @@ def _parse_program(self,
             name = e.name
             ge_out_operator.append(self.var2geop[name])
 
-        # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as: 
+        # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as:
         # if graph_name == "main":
        #     ge_out_operator.append(self.var2geop["reduce_sum_0.tmp_0@GRAD"])
 
@@ -115,6 +162,17 @@ def parse_program(self, startup_program, main_program, input_varlist,
         startup_graph = self._parse_program("startup", startup_program)
         main_graph = self._parse_program("main", main_program, input_varlist,
                                          fetch_list)
+        if self._auto_dp and self._world_rank_size > 1:
+            assert len(self.groups_to_create
+                       ) == 0, "can't parse program under auto_dp mode"
+
+            from paddle.distributed import fleet
+            self.groups_to_create.append(
+                HcomGroupConfig(
+                    name="hcom_group_0",
+                    nranks=fleet.world_size(),
+                    rank_ids=[x for x in range(fleet.world_size())]))
+
         return startup_graph, 
main_graph @@ -124,9 +182,14 @@ class AscendOptimizer(Optimizer): def __init__(self, optimizer, fetch_list=[]): self.inner_opt = optimizer self.fetch_list = fetch_list + self.ascend_instance = None def __del__(self): + print("begin AscendOptimizer del") + if self.ascend_instance is not None: + self.ascend_instance.destroy_global_resources() core.ge_finalize() + print("end AscendOptimizer del") def _can_apply(self): if not self.user_defined_strategy.ascend: @@ -138,7 +201,7 @@ def _disable_strategy(self, dist_strategy): dist_strategy.ascend = False dist_strategy.ascend_configs = {} - def _get_input_varlist(program): + def _get_input_varlist(self, program): ret_list = [] for var in program.list_vars(): if var.is_data or var.persistable: @@ -149,30 +212,56 @@ def minimize(self, loss, startup_program=None, parameter_list=None, - no_grad_set=None): - minimized = self.inner_opt.minimize( - loss, startup_program=startup_program) + no_grad_set=None, + auto_dp=False, + rank_table_file=None): + minimized = None + if self.inner_opt: + minimized = self.inner_opt.minimize( + loss, startup_program=startup_program) self.ascend_instance = core.AscendInstance() + from paddle.distributed import fleet + if auto_dp and fleet.world_size() > 1: + from paddle.fluid.transpiler import ascend_transpiler + t = ascend_transpiler.AscendTranspiler(startup_program, + loss.block.program) + t.transpile() + #print(loss.block.program) + # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": "0", + "ge.exec.deviceId": str(fleet.local_device_ids()), "ge.graphRunMode": "1", - "ge.exec.precision_mode": "must_keep_origin_dtype" + "ge.exec.precision_mode": "must_keep_origin_dtype", } + # if multi trainers + if rank_table_file and fleet.world_size() > 1: + config["ge.exec.rankTableFile"] = rank_table_file + config["ge.exec.rankId"] = str(fleet.worker_index()) + config["ge.exec.isUseHcom"] = "1" + config["ge.exec.deployMode"] = "0" + print("ge_initialize config:", config) core.ge_initialize(config) # Init Session self.ascend_instance.init_global_resources() main_block = loss.block - self.parser = AscendIRParser() + self.parser = AscendIRParser( + auto_dp=auto_dp, world_rank_size=fleet.world_size()) + + input_varlist = self._get_input_varlist(main_block.program) - input_varlist = _get_input_varlist(main_block.program) startup_graph, main_graph = self.parser.parse_program( startup_program, main_block.program, input_varlist, self.fetch_list) + for cfg in self.parser.groups_to_create: + print("create group (%s), nranks: %d, rank_ids: %s" % + (cfg.name, cfg.nranks, cfg.rank_ids)) + hccl.create_group(cfg.name, cfg.nranks, cfg.rank_ids) + self.ascend_instance.add_ascend_subgraph(0, startup_graph) self.ascend_instance.add_ascend_subgraph(1, main_graph) diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 2c5930c5b9f2fc..f2ecaf4843829e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -1,41 +1,106 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import paddle.fluid.framework as framework
 from paddle.fluid.optimizer import Optimizer
 import paddle.fluid.core as core
 import numpy as np
-
-registerd_op = {
-    "elementwise_add": "AddParser",
-    "matmul": "MatMulParser",
-    "mul": "MulParser",
-    "relu": "ReluParser",
-    "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser",
-    "shape": "ShapeParser",
-    "fill_constant": "FillConstantParser",
-    "reduce_sum": "ReduceSumParser",
-    "reduce_sum_grad": "ReduceSumGradParser",
-    "matmul_grad": "MatMulGradParser",
-    "mul_grad": "MulGradParser",
-    "reshape2": "ReshapeParser",
-    "scale": "ScaleParser",
-    "relu_grad": "ReluGradParser",
-    "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser",
-    "truncated_gaussian_random": "TruncatedNormalParser",
-    "sgd": "SGDParser"
-}
+from paddle.distributed import fleet
+from functools import reduce
+
+registerd_op = {## forwards
+    "elementwise_add": "AddParser",
+    "matmul": "MatMulParser",
+    "mul": "MulParser",
+    "relu": "ReluParser",
+    "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser",
+    "shape": "ShapeParser",
+    "fill_constant": "FillConstantParser",
+    "reduce_sum": "ReduceSumParser",
+    "elementwise_mul": "DotMulParser",
+    "elementwise_div": "DotDivParser",
+    "elementwise_pow": "DotPowParser",
+    "elementwise_max": "MaxParser",
+    "elementwise_min": "MinParser",
+    "elementwise_sub": "DotSubParser",
+    "pow": "PowParser",
+    "gelu": "GeluParser",
+    "sqrt": "SqrtParser",
+    "log": "LogParser",
+    "sum": "SumParser",
+    "logical_not": "LogicalNotParser",
+    "gather": "GatherParser",
+    "scatter": "ScatterParser",
+    "cast": "CastParser",
+    "tanh": "TanhParser",
+    "stack": "StackParser",
+    "square": "SquareParser",
+    "unsqueeze2": "UnSqueezeParser",
+    "assign": "AssignParser",
+    "softmax": "SoftMaxParser",
+    "reshape2": "ReshapeParser",
+    "transpose2": "TransposeParser",
+    "layer_norm": "LayerNormParser",
+    "less_than": "LessParser",
+    "mean": "MeanParser",
+    "scale": "ScaleParser",
+    "slice": "SliceParser",
+    "top_k": "TopkParser",
+    "accuracy": "AccuracyParser",
+    #"increment": "IncrementParser",
+    "lookup_table": "LookupTableParser",
+    "truncated_gaussian_random": "TruncatedNormalParser",
+    "c_allgather": "AllGatherParser",
+    "c_allreduce_sum": "AllReduceSumParser",
+    "c_allreduce_max": "AllReduceMaxParser",
+    "c_broadcast": "BroadcastParser",
+    "c_reduce_scatter": "ReduceScatterParser",
+    "c_send": "SendParser",
+    "c_receive": "ReceiveParser",
+    "uniform_random": "UniformRandomParser",
+    "range": "RangeParser",
+    "equal": "EqualParser",
+    "expand": "ExpandParser",
+    "squeeze2": "SqueezeParser",
+
+
+    ## backwards
+    "matmul_grad": "MatMulGradParser",
+    "mul_grad": "MulGradParser",
+    "relu_grad": "ReluGradParser",
+    "reduce_sum_grad": "ReduceSumGradParser",
+    "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser",
+    "tanh_grad": "TanhGradParser",
+    "log_grad": "LogGradParser",
+    "pow_grad": "PowGradParser",
+    "sqrt_grad": "SqrtGradParser",
+    "gelu_grad": "GeluGradParser",
+    "mean_grad": "MeanGradParser",
+    "lookup_table_grad": "LookUpTableGradParser",
+    "elementwise_mul_grad": "DotMulGradParser",
+    
"elementwise_add_grad": "DotAddGradParser", + "elementwise_div_grad": "DotDivGradParser", + "softmax_grad": "SoftmaxGradParser", + "slice_grad": "SliceGradParser", + "reshape2_grad": "ReshapeGradParser", + "gather_grad": "GatherGradParser", + "transpose2_grad": "TransposeGradParser", + "layer_norm_grad": "LayerNormGradParser", + + ## opt + "sgd": "SGDParser", + #"adam": "AdamParser", + } global_cnt = -1 global_input_cnt = -1 @@ -60,6 +125,7 @@ def __init__(self): 5: "float32", 6: "float64" } + self.dtype2paddle_inv_map = {"VarType.FP32": 0, "VarType.FP16": 1} def dtype2ge(self, dtype): assert dtype in self.dtype2ge_map, "dtype[%d] is not supported %d" % ( @@ -105,7 +171,6 @@ def update_output(self, geop_list, index_list): self.parser_name, len(index_list), output_num) for output_id in range(output_num): arguments = self.op.output(self.op.output_names[output_id]) - print("%d argument: %s" % (output_id, str(arguments))) if len(arguments) > 0: assert len(arguments) == len( index_list[output_id] @@ -113,8 +178,6 @@ def update_output(self, geop_list, index_list): self.parser_name, output_id, len(index_list[output_id]), len(arguments)) for i in range(len(arguments)): - print("assgin index_list[%d][%d] to %s" % - (output_id, i, arguments[i])) self.var2geop[arguments[i]] = geop_list[index_list[ output_id][i]] @@ -125,7 +188,7 @@ def apply(self, op): self.op = op assert self.op.type == self.parser_name, "op [%s] != parser_name[%s]" % ( self.op.type, self.parser_name) - print("begin to parse op %s" % (self.parser_name)) + #print("begin to parse op %s" % (self.parser_name)) geop_list, index_list = self._apply() self.update_output(geop_list, index_list) @@ -152,6 +215,63 @@ def _create_ge_tensor(self, shape, dtype, value): tensor.set_data(data_8) return tensor + def _get_ge_tensor(self, shape, dtype, value_list): + tensor_desc = core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + self.ascend_helper.dtype2ge(dtype)) + tensor = core.GETensor(tensor_desc) + + data = np.array(value_list).reshape(shape).astype( + self.ascend_helper.dtype2np(dtype)) + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + + tensor_const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + + return tensor_const + + def _get_variable(self, shape, dtype, tensor): + if dtype == "int32": + type = core.GEDataType.DT_INT32 + elif dtype == "float32": + type = core.GEDataType.DT_FLOAT + + var = core.GEOperatorFactory.create_operator( + "variable" + self._accumulated_op_id(), "Variable") + var.update_output_desc("y", + core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + type)) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", tensor).set_input("ref", var) + + return assign + + def _create_shape_tensor(self): + tensor_desc = core.GETensorDesc( + core.GEShape([2]), core.GEFormat.FORMAT_ND, + core.GEDataType.DT_INT32) + tensor = core.GETensor(tensor_desc) + + data = np.ones((2)).astype("int32").reshape([2]) + data[0] = 64 + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + return tensor + + def _get_GEtensor_shape(self, tensor): + tensor_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", tensor) + tensor_shape = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", 
tensor_shape).set_attr_int32("dst_type", 0) + return tensor_shape + class AddParser(AscendParserBase): def __init__(self, graph, var2geop): @@ -162,109 +282,276 @@ def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) y = self._get_ge_input(self.op.input_arg_names[1]) add = core.GEOperatorFactory.create_operator( - "add" + self._accumulated_op_id(), "Add").set_input( - "x1", x).set_input("x2", y) + "add" + self._accumulated_op_id(), + "Add").set_input("x1", x).set_input("x2", y) return [add], [[0]] -class ReduceSumParser(AscendParserBase): +class DotSubParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReduceSumParser, self).__init__(graph, var2geop) - self.parser_name = "reduce_sum" + super(DotSubParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_sub" def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) - axes = self.op.attr("dim") - keep_dims = self.op.attr("keep_dim") - reduce_sum = core.GEOperatorFactory.create_operator( - "reduce_sum" + self._accumulated_op_id(), "ReduceSumD").set_input( - "x", x, 0).set_attr_vec_int32("axes", axes).set_attr_bool( - "keep_dims", keep_dims) - return [reduce_sum], [[0]] + y = self._get_ge_input(self.op.input_arg_names[1]) + sub = core.GEOperatorFactory.create_operator( + "sub" + self._accumulated_op_id(), + "Sub").set_input("x1", x).set_input("x2", y) + return [sub], [[0]] -class ReduceSumGradParser(AscendParserBase): +class DotMulParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReduceSumGradParser, self).__init__(graph, var2geop) - self.parser_name = "reduce_sum_grad" + super(DotMulParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_mul" def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) - input = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[1]) + mul = core.GEOperatorFactory.create_operator( + "dotmul" + self._accumulated_op_id(), + "Mul").set_input("x1", x).set_input("x2", y) + return [mul], [[0]] - shape_tensor = core.GEOperatorFactory.create_operator( - "shape" + self._accumulated_op_id(), "Shape").set_input("x", input, - 0) - axis_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", self._create_ge_tensor([1], 2, -1)) - self._mark_as_input(axis_const) - broadcast = core.GEOperatorFactory.create_operator( - "broadcast_to_d" + self._accumulated_op_id(), - "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) - # unsqueeze cannot get right result, but ExpandDims seems have the same functionality. 
-        reduce_sum_grad = core.GEOperatorFactory.create_operator(
-            "expand" + self._accumulated_op_id(), "ExpandDims").set_input(
-                "x", broadcast).set_input("axis", axis_const)
-        return [shape_tensor, axis_const, broadcast, reduce_sum_grad], [[3]]
+class DotDivParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(DotDivParser, self).__init__(graph, var2geop)
+        self.parser_name = "elementwise_div"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        div = core.GEOperatorFactory.create_operator(
+            "dotdiv" + self._accumulated_op_id(),
+            "Div").set_input("x1", x).set_input("x2", y)
+        return [div], [[0]]


-class MatMulParser(AscendParserBase):
+class DotPowParser(AscendParserBase):
     def __init__(self, graph, var2geop):
-        super(MatMulParser, self).__init__(graph, var2geop)
-        self.parser_name = "matmul"
+        super(DotPowParser, self).__init__(graph, var2geop)
+        self.parser_name = "elementwise_pow"

     def _apply(self):
-        x1 = self._get_ge_input(self.op.input_arg_names[0])
-        x2 = self._get_ge_input(self.op.input_arg_names[1])
-        matmul = core.GEOperatorFactory.create_operator(
-            "matmul" + self._accumulated_op_id(), "MatMul").set_input(
-                "x1", x1).set_input("x2", x2)
-        return [matmul], [[0]]
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        pow = core.GEOperatorFactory.create_operator(
+            "dotpow" + self._accumulated_op_id(),
+            "Pow").set_input("x1", x).set_input("x2", y)
+        return [pow], [[0]]


-class MatMulGradParser(AscendParserBase):
+class LessParser(AscendParserBase):
     def __init__(self, graph, var2geop):
-        super(MatMulGradParser, self).__init__(graph, var2geop)
-        self.parser_name = "matmul_grad"
+        super(LessParser, self).__init__(graph, var2geop)
+        self.parser_name = "less_than"

     def _apply(self):
-        out_grad = self._get_ge_input(self.op.input_arg_names[0])
-        x = self._get_ge_input(self.op.input_arg_names[1])
-        y = self._get_ge_input(self.op.input_arg_names[2])
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        less_than = core.GEOperatorFactory.create_operator(
+            "less_than" + self._accumulated_op_id(),
+            "Less").set_input("x1", x).set_input("x2", y)
+        return [less_than], [[0]]

-        x_grad = core.GEOperatorFactory.create_operator(
-            self.parser_name + self._accumulated_op_id(), "MatMul").set_input(
-                "x1", out_grad).set_input("x2", y).set_attr_bool(
-                    "transpose_x1", False).set_attr_bool("transpose_x2", True)
-        y_grad = core.GEOperatorFactory.create_operator(
-            self.parser_name + self._accumulated_op_id(), "MatMul").set_input(
-                "x1", x).set_input("x2", out_grad).set_attr_bool(
-                    "transpose_x1", True).set_attr_bool("transpose_x2", False)
-        return [x_grad, y_grad], [[0], [1]]

+class MaxParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(MaxParser, self).__init__(graph, var2geop)
+        self.parser_name = "elementwise_max"

-class MulGradParser(AscendParserBase):
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        max_out = core.GEOperatorFactory.create_operator(
+            "max" + self._accumulated_op_id(),
+            "Maximum").set_input("x1", x).set_input("x2", y)
+        return [max_out], [[0]]
+
+
+class MinParser(AscendParserBase):
     def __init__(self, graph, var2geop):
-        super(MulGradParser, self).__init__(graph, var2geop)
-        self.parser_name = "mul_grad"
+        super(MinParser, self).__init__(graph, var2geop)
+        self.parser_name = "elementwise_min"

     def _apply(self):
-        out_grad = self._get_ge_input(self.op.input_arg_names[0])
-        x = self._get_ge_input(self.op.input_arg_names[1])
-        y = self._get_ge_input(self.op.input_arg_names[2])
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        min_out = core.GEOperatorFactory.create_operator(
+            "min" + self._accumulated_op_id(),
+            "Minimum").set_input("x1", x).set_input("x2", y)
+        return [min_out], [[0]]

-        x_grad = core.GEOperatorFactory.create_operator(
-            self.parser_name + self._accumulated_op_id(), "MatMul").set_input(
-                "x1", out_grad).set_input("x2", y).set_attr_bool(
-                    "transpose_x1", False).set_attr_bool("transpose_x2", True)
-        y_grad = core.GEOperatorFactory.create_operator(
-            self.parser_name + self._accumulated_op_id(), "MatMul").set_input(
-                "x1", x).set_input("x2", out_grad).set_attr_bool(
-                    "transpose_x1", True).set_attr_bool("transpose_x2", False)
-        return [x_grad, y_grad], [[0], [1]]

+## cal
+class LogParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(LogParser, self).__init__(graph, var2geop)
+        self.parser_name = "log"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        log = core.GEOperatorFactory.create_operator(
+            "log" + self._accumulated_op_id(), "Log").set_input("x", x)
+        return [log], [[0]]
+
+
+class SqrtParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SqrtParser, self).__init__(graph, var2geop)
+        self.parser_name = "sqrt"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        sqrt = core.GEOperatorFactory.create_operator(
+            "sqrt" + self._accumulated_op_id(), "Sqrt").set_input("x", x)
+        return [sqrt], [[0]]
+
+
+class PowParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(PowParser, self).__init__(graph, var2geop)
+        self.parser_name = "pow"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        factor = self.op.attr("factor")
+        pow_value = core.GEOperatorFactory.create_operator(
+            "pow" + self._accumulated_op_id(),
+            "Power").set_input("x", x).set_attr_float(
+                "power", factor).set_attr_float("scale", 1.0).set_attr_float(
+                    "shift", 0.0)
+        return [pow_value], [[0]]
+
+
+class SquareParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SquareParser, self).__init__(graph, var2geop)
+        self.parser_name = "square"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        square = core.GEOperatorFactory.create_operator(
+            "square" + self._accumulated_op_id(), "Square").set_input("x", x)
+        return [square], [[0]]
+
+
+class SumParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SumParser, self).__init__(graph, var2geop)
+        self.parser_name = "sum"
+
+    def _apply(self):
+        len_list = len(self.op.input_arg_names)
+        assert len_list >= 2, (
+            "the size of the input list must be greater than or equal to 2")
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        sum = core.GEOperatorFactory.create_operator(
+            "sum" + self._accumulated_op_id(),
+            "Add").set_input("x1", x).set_input("x2", y)
+        for i in range(2, len_list):
+            y = self._get_ge_input(self.op.input_arg_names[i])
+            sum = core.GEOperatorFactory.create_operator(
+                "sum" + self._accumulated_op_id(),
+                "Add").set_input("x1", sum).set_input("x2", y)
+        return [sum], [[0]]
+
+
+class LogicalNotParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(LogicalNotParser, self).__init__(graph, var2geop)
+        self.parser_name = "logical_not"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        logical_not = core.GEOperatorFactory.create_operator(
+            "logical_not" + self._accumulated_op_id(),
+            "LogicalNot").set_input("x", x)
+        return [logical_not], [[0]]
+
+
+class MeanParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(MeanParser, self).__init__(graph, var2geop)
+        self.parser_name = "mean"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        mean = core.GEOperatorFactory.create_operator(
+            "mean" + self._accumulated_op_id(),
+            "ReduceMeanD").set_input("x", x).set_attr_bool(
+                "keep_dims", False).set_attr_vec_int32("axes", [])
+        return [mean], [[0]]
+
+
+class ReduceSumParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(ReduceSumParser, self).__init__(graph, var2geop)
+        self.parser_name = "reduce_sum"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        axes = self.op.attr("dim")
+        keep_dims = self.op.attr("keep_dim")
+        reduce_all = self.op.attr("reduce_all")
+        x_shape = self.op.block.var(self.op.input_arg_names[0]).shape
+        if reduce_all:
+            axes = list(range(len(x_shape)))
+        reduce_sum = core.GEOperatorFactory.create_operator(
+            "reduce_sum" + self._accumulated_op_id(),
+            "ReduceSumD").set_input("x", x, 0).set_attr_vec_int32(
+                "axes", axes).set_attr_bool("keep_dims", keep_dims)
+        return [reduce_sum], [[0]]
+
+
+#class IncrementParser(AscendParserBase):
+#    def __init__(self, graph, var2geop):
+#        super(IncrementParser, self).__init__(graph, var2geop)
+#        self.parser_name = "increment"
+#
+#    def _apply(self):
+#        x = self._get_ge_input(self.op.input_arg_names[0])
+#        step = self.op.attr("step")  #self._get_ge_input(self.op.input_arg_names[1])
+#        print("step: ", step)
+#
+#        increment = core.GEOperatorFactory.create_operator("adds" + self._accumulated_op_id(), "Adds").set_input("x", x).set_attr_float("value", step)  #set_input("x2", bias)
+#
+#        return [increment]


+## matrix cal
+class MatMulParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(MatMulParser, self).__init__(graph, var2geop)
+        self.parser_name = "matmul"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        y = self._get_ge_input(self.op.input_arg_names[1])
+        transpose_x = self.op.attr("transpose_X")
+        transpose_y = self.op.attr("transpose_Y")
+
+        x1_shape = self.op.block.var(self.op.input_arg_names[0]).shape
+        x2_shape = self.op.block.var(self.op.input_arg_names[1]).shape
+
+        if len(x1_shape) > 2:
+            matmul = core.GEOperatorFactory.create_operator(
+                "matmul" + self._accumulated_op_id(), "BatchMatMul").set_input(
+                    "x1", x).set_input("x2", y).set_attr_bool(
+                        "adj_x1",
+                        transpose_x).set_attr_bool("adj_x2", transpose_y)
+        elif len(x1_shape) == 2:
+            matmul = core.GEOperatorFactory.create_operator(
+                "matmul" + self._accumulated_op_id(),
+                "MatMul").set_input("x1", x).set_input("x2", y).set_attr_bool(
+                    "transpose_x1", transpose_x).set_attr_bool("transpose_x2",
+                                                               transpose_y)
+        else:
+            assert False, "not supported"
+        return [matmul], [[0]]

 class MulParser(AscendParserBase):
@@ -275,13 +562,105 @@ def __init__(self, graph, var2geop):
     def _apply(self):
         x = self._get_ge_input(self.op.input_arg_names[0])
         y = self._get_ge_input(self.op.input_arg_names[1])
+        x_num_col_dims = self.op.attr("x_num_col_dims")
+        y_num_col_dims = self.op.attr("y_num_col_dims")
+        shape_x1 = self.op.block.var(self.op.input_arg_names[0]).shape
+        shape_x2 = self.op.block.var(self.op.input_arg_names[1]).shape
+
+        if x_num_col_dims == 1 and y_num_col_dims == 1:
+            if len(shape_x1) == 2 and len(shape_x2) == 2:
+                matmul = core.GEOperatorFactory.create_operator(
+                    "mul" + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", x).set_input("x2", y)
+            elif len(shape_x1) == 3 and len(shape_x2) == 2:
+                flatten_x1 = core.GEOperatorFactory.create_operator(
+                    "flatten" + self._accumulated_op_id(),
+                    "Flatten").set_input("x", x)
+                matmul = core.GEOperatorFactory.create_operator(
+                    "mul" + self._accumulated_op_id(), "MatMul").set_input(
+                        "x1", flatten_x1, 0).set_input("x2", y, 0)
+            else:
+                assert False, "not supported"
+        else:
+            if len(shape_x1) == 3 and len(shape_x2) == 2:
+                assert x_num_col_dims == 2, "only x_num_col_dims == 2 is supported"
+                flatten_x1 = core.GEOperatorFactory.create_operator(
+                    "flatten" + self._accumulated_op_id(),
+                    "FlattenV2").set_input("x", x).set_attr_int32(
+                        "axis", 0).set_attr_int32("end_axis", 1)
+                matmul_m = core.GEOperatorFactory.create_operator(
+                    "mul" + self._accumulated_op_id(), "MatMul").set_input(
+                        "x1", flatten_x1, 0).set_input("x2", y, 0)
+                matmul_transpose = core.GEOperatorFactory.create_operator(
+                    "transpose" + self._accumulated_op_id(),
+                    "TransposeD").set_input(
+                        "x", matmul_m).set_attr_vec_int32("perm", [1, 0])
+                tensor = self._create_ge_tensor(
+                    [3], 2, [shape_x2[1], shape_x1[0], shape_x1[1]])
+                const_shape = core.GEOperatorFactory.create_operator(
+                    "shape" + self._accumulated_op_id(),
+                    "Const").set_attr_tensor("value", tensor)
+                reshape_matmul = core.GEOperatorFactory.create_operator(
+                    "reshape" + self._accumulated_op_id(), "Reshape").set_input(
+                        "x", matmul_transpose).set_input(
+                            "shape", const_shape).set_attr_int32("axis", 0)
+                matmul = core.GEOperatorFactory.create_operator(
+                    "transpose" + self._accumulated_op_id(),
+                    "TransposeD").set_input(
+                        "x",
+                        reshape_matmul).set_attr_vec_int32("perm", [1, 2, 0])
+            else:
+                assert False, "not supported"

-        matmul = core.GEOperatorFactory.create_operator(
-            "mul" + self._accumulated_op_id(), "MatMul").set_input(
-                "x1", x).set_input("x2", y)
         return [matmul], [[0]]


+class LayerNormParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(LayerNormParser, self).__init__(graph, var2geop)
+        self.parser_name = "layer_norm"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[2])
+        scale = self._get_ge_input(self.op.input_arg_names[1])
+        bias = self._get_ge_input(self.op.input_arg_names[0])
+        epsilon = self.op.attr("epsilon")
+        begin_norm_axis = self.op.attr("begin_norm_axis")
+        x_dtype = self.op.block.var(self.op.input_arg_names[2]).dtype
+
+        shape_tensor = core.GEOperatorFactory.create_operator(
+            "shape" + self._accumulated_op_id(), "Shape").set_input("x", x)
+        scale_expand = core.GEOperatorFactory.create_operator(
+            "broadcast_to_d" + self._accumulated_op_id(),
+            "BroadcastTo").set_input("x",
+                                     scale).set_input("shape", shape_tensor)
+        bias_expand = core.GEOperatorFactory.create_operator(
+            "broadcast_to_d" + self._accumulated_op_id(),
+            "BroadcastTo").set_input("x", bias).set_input("shape", shape_tensor)
+        layer_norm = core.GEOperatorFactory.create_operator(
+            "layer_norm" + self._accumulated_op_id(),
+            "LayerNorm").set_input("x", x).set_input(
+                "gamma",
+                scale_expand).set_input("beta", bias_expand).set_attr_int32(
+                    "begin_norm_axis", begin_norm_axis).set_attr_int32(
+                        "begin_params_axis",
+                        begin_norm_axis).set_attr_float("epsilon", epsilon)
+
+        cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str(
+            x_dtype)] == 0 else 1
+        y = core.GEOperatorFactory.create_operator(
+            "cast" + self._accumulated_op_id(), "Cast").set_input(
+                "x", layer_norm, 0).set_attr_int32("dst_type", cast_dtype)
+        mean = core.GEOperatorFactory.create_operator(
+            "cast" + self._accumulated_op_id(), "Cast").set_input(
+                "x", layer_norm, 1).set_attr_int32("dst_type", cast_dtype)
+        variance = core.GEOperatorFactory.create_operator(
+            "cast" + self._accumulated_op_id(), "Cast").set_input(
+                "x", layer_norm, 2).set_attr_int32("dst_type", cast_dtype)
+        return [y, mean, variance], [[1], [2], [0]]
+
+
+## activation function
 class ReluParser(AscendParserBase):
     def __init__(self, graph, var2geop):
         super(ReluParser, self).__init__(graph, var2geop)
@@ -294,20 +673,31 @@ def _apply(self):
         return [relu], [[0]]


-class ReluGradParser(AscendParserBase):
+class GeluParser(AscendParserBase):
     def __init__(self, graph, var2geop):
-        super(ReluGradParser, self).__init__(graph, var2geop)
-        self.parser_name = "relu_grad"
+        super(GeluParser, self).__init__(graph, var2geop)
+        self.parser_name = "gelu"

     def _apply(self):
-        out = self._get_ge_input(self.op.input_arg_names[0])
-        out_grad = self._get_ge_input(self.op.input_arg_names[1])
-        relu_grad = core.GEOperatorFactory.create_operator(
-            self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input(
-                "gradients", out_grad).set_input("features", out)
-        return [relu_grad], [[0]]
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        gelu = core.GEOperatorFactory.create_operator(
+            "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x)
+        return [gelu], [[0]]
+
+
+class TanhParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(TanhParser, self).__init__(graph, var2geop)
+        self.parser_name = "tanh"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        tanh = core.GEOperatorFactory.create_operator(
+            "tanh" + self._accumulated_op_id(), "Tanh").set_input("x", x)
+        return [tanh], [[0]]


+## loss function
 class SoftmaxWithCrossEntropyParser(AscendParserBase):
     def __init__(self, graph, var2geop):
         super(SoftmaxWithCrossEntropyParser, self).__init__(graph, var2geop)
@@ -316,80 +706,61 @@ def _apply(self):
         label = self._get_ge_input(self.op.input_arg_names[0])
         logits = self._get_ge_input(self.op.input_arg_names[1])
         cls_num = self.op.block.var(self.op.input_arg_names[1]).shape[1]
+
         softmax = core.GEOperatorFactory.create_operator(
-            "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input(
-                "x", logits)
+            "softmax" + self._accumulated_op_id(),
+            "SoftmaxV2").set_input("x", logits)
         label = core.GEOperatorFactory.create_operator(
             "cast" + self._accumulated_op_id(), "Cast").set_input(
                 "x", label).set_attr_int32("dst_type", 3)

         tensoron = self._create_ge_tensor([1], 5, 1)
-        on_const = core.GEOperatorFactory.create_operator(
-            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
-                "value", tensoron)
-        self._mark_as_input(on_const)
+        on = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value", tensoron)
         tensoroff = self._create_ge_tensor([1], 5, 0)
-        off_const = core.GEOperatorFactory.create_operator(
-            "const" + self._accumulated_op_id(), "Const").set_attr_tensor(
-                "value", tensoroff)
-        self._mark_as_input(off_const)
+        off = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value", tensoroff)
+        self._mark_as_input(on)
+        self._mark_as_input(off)
         onehot = core.GEOperatorFactory.create_operator(
             "onehot" + self._accumulated_op_id(), "OneHotD").set_input(
-                "x", 
label).set_input("on_value", on_const).set_input( - "off_value", off_const).set_attr_int32("depth", cls_num) + "x", label).set_input("on_value", on).set_input( + "off_value", off).set_attr_int32("depth", cls_num) squeeze = core.GEOperatorFactory.create_operator( "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot) - loss = core.GEOperatorFactory.create_operator( + + loss_all = core.GEOperatorFactory.create_operator( "loss" + self._accumulated_op_id(), "SoftmaxCrossEntropyWithLogits").set_input( "features", logits).set_input("labels", squeeze) - - return [label, softmax, on_const, off_const, onehot, squeeze, - loss], [[6], [1]] + loss = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", loss_all, 0).set_attr_int32("dst_type", 0) + loss_expand = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", loss).set_attr_vec_int32("axes", [1]) + return [label, softmax, loss_expand], [[2], [1]] -class SoftmaxWithCrossEntropyGradParser(AscendParserBase): +class SoftMaxParser(AscendParserBase): def __init__(self, graph, var2geop): - super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop) - self.parser_name = "softmax_with_cross_entropy_grad" + super(SoftMaxParser, self).__init__(graph, var2geop) + self.parser_name = "softmax" def _apply(self): - label = self._get_ge_input(self.op.input_arg_names[0]) - loss_grad = self._get_ge_input(self.op.input_arg_names[1]) - softmax = self._get_ge_input(self.op.input_arg_names[2]) - cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1] + logits = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axis") - tensoron = self._create_ge_tensor([1], 5, 1) - on_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoron) - self._mark_as_input(on_const) - tensoroff = self._create_ge_tensor([1], 5, 0) - off_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoroff) - self._mark_as_input(off_const) - label = core.GEOperatorFactory.create_operator( - "cast" + self._accumulated_op_id(), "Cast").set_input( - "x", label).set_attr_int32("dst_type", 3) - onehot = core.GEOperatorFactory.create_operator( - "onehot" + self._accumulated_op_id(), "OneHotD").set_input( - "x", label).set_input("on_value", on_const).set_input( - "off_value", off_const).set_attr_int32("depth", cls_num) - # the fuck onehot will add a demension, so must call squeeze afterward - squeeze = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot) - sub = core.GEOperatorFactory.create_operator( - "sub" + self._accumulated_op_id(), "Sub").set_input( - "x1", softmax).set_input("x2", squeeze) - grad = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "Mul").set_input( - "x1", loss_grad).set_input("x2", sub) - return [on_const, off_const, label, onehot, squeeze, sub, grad], [[-1]] + softmax = core.GEOperatorFactory.create_operator( + "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input( + "x", logits).set_attr_vec_int32("axes", [axes]) + return [softmax], [[0]] +## general class ShapeParser(AscendParserBase): def __init__(self, graph, var2geop): super(ShapeParser, self).__init__(graph, var2geop) @@ -411,16 +782,15 @@ def _apply(self): shape = self.op.attr("shape") dtype = self.op.attr("dtype") value = 
self.op.attr("value") - print("shape: ", shape) - print("dtype: ", dtype) - print("value: ", value) + tensor = self._create_ge_tensor(shape, dtype, value) const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor) + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) self._mark_as_input(const) if self.op.block.var(self.op.output('Out')[0]).persistable: - print("%s fill_constant" % (self.op.output('Out')[0])) + #print("%s is Persistable in fill_constant" % + # (self.op.output('Out')[0])) var = core.GEOperatorFactory.create_operator( self.op.output('Out')[0], "Variable") var.update_output_desc("y", @@ -432,26 +802,7 @@ def _apply(self): "assign" + self._accumulated_op_id(), "Assign").set_input( "value", const).set_input("ref", var) return [const], [[0]] - else: - print( - "self.op.output('Out')[0] is not persistable in fill_constant") - return [const], [[0]] - - -class SGDParser(AscendParserBase): - def __init__(self, graph, var2geop): - super(SGDParser, self).__init__(graph, var2geop) - self.parser_name = "sgd" - - def _apply(self): - grad = self._get_ge_input(self.op.input_arg_names[0]) - lr = self._get_ge_input(self.op.input_arg_names[1]) - param = self._get_ge_input(self.op.input_arg_names[2]) - sgd = core.GEOperatorFactory.create_operator( - "momentum" + self._accumulated_op_id(), - "ApplyGradientDescent").set_input("var", param).set_input( - "alpha", lr).set_input("delta", grad) - return [sgd], [[0]] + return [const], [[0]] class TruncatedNormalParser(AscendParserBase): @@ -465,30 +816,27 @@ def _apply(self): mean = self.op.attr("mean") std = self.op.attr("std") seed = self.op.attr("seed") + tensor1 = self._create_ge_tensor([len(shape)], 2, shape) shape_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor1) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor1) tensor2 = self._create_ge_tensor([1], dtype, mean) mean_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor2) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor2) tensor3 = self._create_ge_tensor([1], dtype, std) std_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor3) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor3) tensor4 = self._create_ge_tensor([1], dtype, mean - 2 * std) min_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor4) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor4) tensor5 = self._create_ge_tensor([1], dtype, mean + 2 * std) max_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor5) + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor5) self._mark_as_input(shape_tensor) self._mark_as_input(mean_tensor) @@ -507,9 +855,8 @@ def _apply(self): ## wirte the output of truncatedNormal from startup_program to main_program if self.op.block.var(self.op.output('Out')[0]).persistable: - print("%s is Persistable in truncated_normal" % - (self.op.output('Out')[0])) - #var = core.GEOperatorFactory.create_operator(self.op.output('Out')[0], "Variable").set_input("x", 
truncated_normal) + #print("%s is Persistable in truncated_normal" % + # (self.op.output('Out')[0])) var = core.GEOperatorFactory.create_operator( self.op.output('Out')[0], "Variable") var.update_output_desc("y", @@ -524,66 +871,1313 @@ def _apply(self): shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor, truncated_normal ], [[-1]] - else: - print( - "self.op.output('Out')[0] is not persistable in truncated_noraml" - ) - return [truncated_normal], [[0]] #[assign] + #else: + # print( + # "self.op.output('Out')[0] is not persistable in truncated_noraml" + # ) + return [truncated_normal], [[0]] -class ScaleParser(AscendParserBase): +class GatherParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ScaleParser, self).__init__(graph, var2geop) - self.parser_name = "scale" + super(GatherParser, self).__init__(graph, var2geop) + self.parser_name = "gather" def _apply(self): - x = self._get_ge_input(self.op.input_arg_names[0]) - scale = self.op.attr( - "scale") #self.get_ge_input(self.op.input_arg_names[1]) - bias = self.op.attr("bias") - bias_after_scale = self.op.attr("bias_after_scale") - if bias_after_scale: - scale_value = core.GEOperatorFactory.create_operator( - "scale" + self._accumulated_op_id(), "Power").set_input( - "x", x).set_attr_float("power", 1.0).set_attr_float( + index = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + clo = self.op.block.var(self.op.input_arg_names[1]).shape[-1] + + gather = core.GEOperatorFactory.create_operator( + "gather" + self._accumulated_op_id(), "Gather").set_input( + "x", x).set_input("indices", index).set_attr_bool( + "validate_indices", True) + return [gather], [[0]] + + +class ScatterParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ScatterParser, self).__init__(graph, var2geop) + self.parser_name = "scatter" + + def _apply(self): + index = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + updates = self._get_ge_input(self.op.input_arg_names[2]) + overwrite = self.op.attr("overwrite") + index_shape = self.op.block.var(self.op.input_arg_names[0]).shape + + if len(index_shape) == 1: + index = core.GEOperatorFactory.create_operator( + "unsqueeze" + self.getid(), "Unsqueeze").set_input( + "x", index).set_attr_vec_int32("axes", [1]) + if not overwrite: + scatter_value = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterAdd").set_input( + "x", x_var).set_input("indices", index_var).set_input( + "updates", updatesi_var) + else: + scatter_value = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterUpdate").set_input( + "x", x_var).set_input("indices", index_var).set_input( + "updates", updates_var) + return [x_var, index_var, updates_var, scatter_value], [[-1]] + + +class CastParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(CastParser, self).__init__(graph, var2geop) + self.parser_name = "cast" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + dtype = self.op.attr("out_dtype") + cast = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x).set_attr_int32("dst_type", dtype) + return [cast], [[0]] + + +class AssignParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AssignParser, self).__init__(graph, var2geop) + self.parser_name = "assign" + + def _apply(self): + const = 
self._get_ge_input(self.op.input_arg_names[0]) + var = self._get_ge_input(self.op.input_arg_names[1]) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", const).set_input("ref", var) + return [assign], [[0]] + + +class ScaleParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ScaleParser, self).__init__(graph, var2geop) + self.parser_name = "scale" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + scale = self.op.attr("scale") + bias = self.op.attr("bias") + bias_after_scale = self.op.attr("bias_after_scale") + + if bias_after_scale: + scale_value = core.GEOperatorFactory.create_operator( + "scale" + self._accumulated_op_id(), "Power").set_input( + "x", x).set_attr_float("power", 1.0).set_attr_float( "scale", scale).set_attr_float("shift", bias) else: x_add_bias = core.GEOperatorFactory.create_operator( "adds" + self._accumulated_op_id(), "Adds").set_input( - "x", x).set_attr_float("value", - bias) #set_input("x2", bias) + "x", x).set_attr_float("value", bias) scale_value = core.GEOperatorFactory.create_operator( "scale" + self._accumulated_op_id(), "Power").set_input( - "x", x_add_bias).set_attr_float( - "power", 1.0).set_attr_float( - "scale", scale).set_attr_float("shift", 0.0) - #tensor_zeros = core.GEOperatorFactory.create_operator("zeroslike" + self.getid(), "ZerosLike").set_input("x", x) - #bias_ = self.create_ge_tensor([1], 5, bias) - #const_bias = core.GEOperatorFactory.create_operator("const" + self.getid(), "Const").set_attr_tensor("value", tensor_bias) + "x", + x_add_bias).set_attr_float("power", 1.0).set_attr_float( + "scale", scale).set_attr_float("shift", 0.0) return [scale_value], [[0]] +class SliceParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SliceParser, self).__init__(graph, var2geop) + self.parser_name = "slice" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axes") + starts = self.op.attr("starts") + ends = self.op.attr("ends") + + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + len_shape = len(x_shape) + axes_cor = list(range(len_shape)) + starts_cor, ends_cor = [], [] + cnt = 0 + for i in range(len_shape): + starts_cor.append(starts[cnt] if i in axes else 0) + if i in axes and ends[cnt] <= x_shape[i]: + ends_cor.append(ends[cnt]) + else: + ends_cor.append(x_shape[i]) + if i in axes: + cnt += 1 + size = [ends_cor[i] - starts_cor[i] for i in range(len(axes_cor))] + + assert len(axes_cor) == len(starts_cor) == len( + ends_cor), "the three fields must have same size" + slice_value = core.GEOperatorFactory.create_operator( + "slice" + self._accumulated_op_id(), "SliceD").set_input( + "x", x).set_attr_vec_int32( + "offsets", starts_cor).set_attr_vec_int32("size", size) + + return [slice_value], [[0]] + + class ReshapeParser(AscendParserBase): def __init__(self, graph, var2geop): super(ReshapeParser, self).__init__(graph, var2geop) self.parser_name = "reshape2" def _apply(self): - print("swbuf:", self.op.input_arg_names) + org_shape = self.op.block.var(self.op.input_arg_names[0]).shape + assert org_shape.count(-1) == 0, "do not allow the dim is -1" shape = self.op.attr("shape") - axis = 0 - if shape[0] == -1: - axis = 1 - shape = shape[1:] - print("shape: ", shape) - data_x1_shape = self._get_ge_input(self.op.input_arg_names[0]) + for cnt in range(len(shape)): + if shape[cnt] == 0: + shape[cnt] = org_shape[cnt] + + if -1 in shape: + assert shape.count(-1) == 1, "only 
allow one dim is -1" + mul_res_org = reduce(lambda x, y: x * y, org_shape) + mul_res_refine = reduce(lambda x, y: x * y, shape) * -1 + idx = shape.index(-1) + shape[idx] = mul_res_org // mul_res_refine + + x = self._get_ge_input(self.op.input_arg_names[0]) tensor = self._create_ge_tensor([len(shape)], 2, shape) const_shape = core.GEOperatorFactory.create_operator( - "shape" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor) + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) reshape = core.GEOperatorFactory.create_operator( "reshape" + self._accumulated_op_id(), "Reshape").set_input( - "x", data_x1_shape).set_input( - "shape", const_shape).set_attr_int32("axis", axis) + "x", + x).set_input("shape", const_shape).set_attr_int32("axis", 0) + x_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + + return [x_shape, reshape], [[1], [0]] + + +class TransposeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TransposeParser, self).__init__(graph, var2geop) + self.parser_name = "transpose2" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + perm = self.op.attr("axis") + transpose = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), "TransposeD").set_input( + "x", x).set_attr_vec_int32("perm", perm) + x_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + + return [x_shape, transpose], [[1], [0]] + + +class AccuracyParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AccuracyParser, self).__init__(graph, var2geop) + self.parser_name = "accuracy" + + def _apply(self): + pred = self._get_ge_input(self.op.input_arg_names[0]) + label = self._get_ge_input(self.op.input_arg_names[1]) + logits = self._get_ge_input(self.op.input_arg_names[2]) + + pred = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", pred).set_attr_int32("dst_type", 3) + label = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", label).set_attr_int32("dst_type", 3) + equal = core.GEOperatorFactory.create_operator( + "equal" + self._accumulated_op_id(), "Equal").set_input( + "x1", pred).set_input("x2", label) + cast = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", equal).set_attr_int32("dst_type", 0) + acc = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), "ReduceMeanD").set_input( + "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32( + "axes", []) + correct = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32( + "axes", []) + ones_tensor = core.GEOperatorFactory.create_operator( + "oneslike" + self._accumulated_op_id(), + "OnesLike").set_input("x", label) + ones_tensor = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", ones_tensor).set_attr_int32("dst_type", 0) + total = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", ones_tensor).set_attr_bool( + "keep_dims", False).set_attr_vec_int32("axes", []) + + return [acc, correct, total], [[0], [1], [2]] + + +class TopkParser(AscendParserBase): + def __init__(self, graph, var2geop): + 
super(TopkParser, self).__init__(graph, var2geop) + self.parser_name = "top_k" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + k = self.op.attr("k") + + tensor = self._create_ge_tensor([1], 2, k) + const_k = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + cast_x = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), + "Cast").set_input("x", x).set_attr_int32("dst_type", 1) + topk = core.GEOperatorFactory.create_operator( + "topk" + self._accumulated_op_id(), + "TopK").set_input("x", cast_x).set_input("k", const_k) + value = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", topk, 0).set_attr_int32("dst_type", 0) + index = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", topk, 1).set_attr_int32("dst_type", 0) + return [value, index], [[1], [0]] + + +class LookupTableParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LookupTableParser, self).__init__(graph, var2geop) + self.parser_name = "lookup_table" + + def _apply(self): + ids = self._get_ge_input(self.op.input_arg_names[0]) + w = self._get_ge_input(self.op.input_arg_names[1]) + + ids_squeeze = core.GEOperatorFactory.create_operator( + "squeeze" + self._accumulated_op_id(), "Squeeze").set_input( + "x", ids).set_attr_vec_int32("axes", [-1]) + out = core.GEOperatorFactory.create_operator( + "lookup" + self._accumulated_op_id(), "Gather").set_input( + "x", w).set_input("indices", ids_squeeze) + return [out], [[0]] + + +class StackParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(StackParser, self).__init__(graph, var2geop) + self.parser_name = "stack" + + def _apply(self): + tiles = len(self.op.input_arg_names) + data_x_lst = [] + for index in range(tiles): + data_x_lst.append( + self._get_ge_input(self.op.input_arg_names[index])) + axis = self.op.attr("axis") + + data_x = data_x_lst[0] + tensor = self._create_ge_tensor([1], 2, axis) + tensor_axis = core.GEOperatorFactory.create_operator( + "axis" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + expand = core.GEOperatorFactory.create_operator( + "expand" + self._accumulated_op_id(), + "ExpandDims").set_input("x", data_x).set_input("axis", tensor_axis) + + stack = core.GEOperatorFactory.create_operator( + "stack" + self._accumulated_op_id(), + "TileWithAxis").set_input("x", expand).set_attr_int32( + "axis", axis).set_attr_int32("tiles", tiles) + + return [stack], [[0]] + + +class UnSqueezeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(UnSqueezeParser, self).__init__(graph, var2geop) + self.parser_name = "unsqueeze2" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr('axes') + + output = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", x).set_attr_vec_int32("axes", axes) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", output) + return [shape, output], [[1], [0]] + + +## parallel +class AllGatherParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AllGatherParser, self).__init__(graph, var2geop) + self.parser_name = "c_allgather" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + rank_size = self.op.attr("rank_size") + group = self.op.attr("group") 
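+        # NOTE: paddle's c_allgather collective maps onto GE's HcomAllGather;
+        # rank_size and group are read straight from the op's attributes.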
+ + allgather = core.GEOperatorFactory.create_operator( + "allgather" + self._accumulated_op_id(), "HcomAllGather").set_input( + "x", x).set_attr_int32( + "rank_size", rank_size).set_attr_string("group", group) + return [allgather], [[0]] + + +class AllReduceParser(AscendParserBase): + def __init__(self, graph, var2geop, reduction): + super(AllReduceParser, self).__init__(graph, var2geop) + self.parser_name = "c_allreduce_" + reduction + self.reduction = reduction + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + reduction = self.reduction + ring_id = self.op.attr("ring_id") + group = "hcom_group_" + str(ring_id) + fusion = None #self.op.attr("fusion") + fusion_id = None #self.op.attr("fusion_id") + + allreduce = core.GEOperatorFactory.create_operator( + "allreduce" + self._accumulated_op_id(), "HcomAllReduce").set_input( + "x", x).set_attr_string( + "reduction", reduction).set_attr_string("group", group) + if fusion is not None: + allreduce.set_attr_int32("fusion", fusion) + + if fusion_id is not None: + allreduce.set_attr_int32("fusion_id", fusion_id) + return [allreduce], [[0]] + + +class AllReduceSumParser(AllReduceParser): + def __init__(self, graph, var2geop): + super(AllReduceSumParser, self).__init__(graph, var2geop, 'sum') + + +class AllReduceMaxParser(AllReduceParser): + def __init__(self, graph, var2geop): + super(AllReduceMaxParser, self).__init__(graph, var2geop, 'max') + + +class BroadcastParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(BroadcastParser, self).__init__(graph, var2geop) + self.parser_name = "c_broadcast" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + root_rank = self.op.attr("root_rank") + group = self.op.attr("group") + + broadcast = core.GEOperatorFactory.create_operator( + "broadcast" + self._accumulated_op_id(), "HcomBroadcast").set_input( + "x", x).set_attr_int32( + "root_rank", root_rank).set_attr_string("group", group) + return [broadcast], [[0]] + + +class ReduceScatterParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceScatterParser, self).__init__(graph, var2geop) + self.parser_name = "c_reduce_scatter" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + reduction = self.op.attr("reduction") + group = self.op.attr("group") + rank_size = self.op.attr("rank_size") + + reduce_scatter = core.GEOperatorFactory.create_operator( + "reducescatter" + self._accumulated_op_id(), + "HcomReduceScatter").set_input("x", x).set_attr_string( + "reduction", reduction).set_attr_string( + "group", group).set_attr_int32("rank_size", rank_size) + return [reduce_scatter], [[0]] + + +class SendParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SendParser, self).__init__(graph, var2geop) + self.parser_name = "c_send" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sr_tag = self.op.attr("sr_tag") + dest_rank = self.op.attr("dest_rank") + group = self.op.attr("group") + + send = core.GEOperatorFactory.create_operator( + "send" + self._accumulated_op_id(), "HcomSend").set_input( + "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32( + "dest_rank", dest_rank).set_attr_string("group", group) + return [send], [[0]] + + +class ReceiveParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReceiveParser, self).__init__(graph, var2geop) + self.parser_name = "c_receive" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sr_tag = self.op.attr("sr_tag") + 
src_rank = self.op.attr("src_rank")
+        group = self.op.attr("group")
+        shape = self.op.attr("shape")
+        dtype = self.op.attr("dtype")
+
+        receive = core.GEOperatorFactory.create_operator(
+            "receive" + self._accumulated_op_id(), "HcomReceive").set_input(
+                "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32(
+                    "src_rank", src_rank).set_attr_string(
+                        "group", group).set_attr_vec_int32(
+                            "shape", shape).set_attr_int32("dtype", dtype)
+        return [receive], [[0]]
+
+
+class RangeParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(RangeParser, self).__init__(graph, var2geop)
+        self.parser_name = "range"
+
+    def _apply(self):
+        # TODO: range type is not supported yet
+        start = self._get_ge_input(self.op.input_arg_names[0])
+        end = self._get_ge_input(self.op.input_arg_names[1])
+        delta = self._get_ge_input(self.op.input_arg_names[2])
+
+        ge_range = core.GEOperatorFactory.create_operator(
+            "range" + self._accumulated_op_id(), "Range")\
+            .set_input("start", start)\
+            .set_input("limit", end)\
+            .set_input("delta", delta)
+
+        return [ge_range], [[0]]
+
+
+class UniformRandomParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(UniformRandomParser, self).__init__(graph, var2geop)
+        self.parser_name = "uniform_random"
+
+    def _apply(self):
+        shape = self.op.attr("shape")
+
+        min_v = self.op.attr("min")
+        max_v = self.op.attr("max")
+        seed = self.op.attr("seed")
+        dtype = self.op.attr("dtype")
+        assert max_v > min_v, "max_v must be greater than min_v, " + \
+            "but received max_v={}, min_v={}".format(max_v, min_v)
+
+        tensor1 = self._create_ge_tensor([len(shape)], 2, shape)
+        shape_tensor = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value", tensor1)
+
+        ge_ur = core.GEOperatorFactory.create_operator(
+            "uniform_random" + self._accumulated_op_id(), "RandomUniform")\
+            .set_input("shape", shape_tensor)\
+            .set_attr_dtype("dtype", self.ascend_helper.dtype2ge(dtype))\
+            .set_attr_int32("seed", seed)\
+            .set_attr_int32("seed2", seed)
+
+        scale = max_v - min_v
+
+        scale_value = core.GEOperatorFactory.create_operator(
+            "scale" + self._accumulated_op_id(), "Power").set_input(
+                "x", ge_ur).set_attr_float("power", 1.0).set_attr_float(
+                    "scale", scale).set_attr_float("shift", min_v)
+
+        return [scale_value], [[0]]
+
+
+class EqualParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(EqualParser, self).__init__(graph, var2geop)
+        self.parser_name = "equal"
+
+    def _apply(self):
+        data_x1 = self._get_ge_input(self.op.input_arg_names[0])
+        data_x2 = self._get_ge_input(self.op.input_arg_names[1])
+        equal = core.GEOperatorFactory.create_operator("equal" \
+            + self._accumulated_op_id(), "Equal")\
+            .set_input("x1", data_x1)\
+            .set_input("x2", data_x2)
+        return [equal], [[0]]
+
+
+class ExpandParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(ExpandParser, self).__init__(graph, var2geop)
+        self.parser_name = "expand"
+
+    def _apply(self):
+        data_x1_shape = self._get_ge_input(self.op.input_arg_names[0])
+        expand_times = self.op.attr('expand_times')
+
+        tensor = self._create_ge_tensor([len(expand_times)], 2, expand_times)
+        expand_tensor = core.GEOperatorFactory.\
+            create_operator("const" + self._accumulated_op_id(), "Const")\
+            .set_attr_tensor("value", tensor)
+
+        assign = core.GEOperatorFactory\
+            .create_operator("tile" + self._accumulated_op_id(), "Tile")\
+            .set_input("x", data_x1_shape)\
+            .set_input("multiples", expand_tensor)
+        return [assign], [[0]]
+
+
+class 
SqueezeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqueezeParser, self).__init__(graph, var2geop) + self.parser_name = "squeeze2" + + def _apply(self): + tensor = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axes") + + data_squeezed = core.GEOperatorFactory\ + .create_operator("squeeze" + self._accumulated_op_id(), "Squeeze")\ + .set_input("x", tensor)\ + .set_attr_vec_int32("axes", axes) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Shape").set_input("x", data_squeezed) + return [shape, data_squeezed], [[1], [0]] + + +#****************************************************************# +#*************************** *************************# +#*************************** *************************# +#*************************** GradParser *************************# +#*************************** *************************# +#*************************** *************************# +#****************************************************************# +## grad +class ReduceSumGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumGradParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum_grad" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Shape").set_input("x", input, 0) + tensoron = self._create_ge_tensor([1], 2, -1) + const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) + self._mark_as_input(const) + + reduce_sum = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) + #reduce_sum = core.GEOperatorFactory.create_operator("expand" + self._accumulated_op_id(), "ExpandDims").set_input("x", reduce_sum).set_input("axis", const) + + return [reduce_sum], [[0]] + + +class MatMulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulGradParser, self).__init__(graph, var2geop) + self.parser_name = "matmul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + transpose_x = self.op.attr("transpose_X") + transpose_y = self.op.attr("transpose_Y") + + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + x_shape = self.op.block.var(self.op.input_arg_names[1]).shape + y_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + if len(x_shape) > 2: + if transpose_y: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", False) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", x).set_attr_bool( + "adj_x1", True).set_attr_bool("adj_x2", False) + else: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + 
self._accumulated_op_id(),
+                    "BatchMatMul").set_input("x1", x).set_input(
+                        "x2", out_grad).set_attr_bool(
+                            "adj_x1", True).set_attr_bool("adj_x2", False)
+        else:
+            if transpose_y:
+                x_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", out_grad).set_input(
+                        "x2", y).set_attr_bool(
+                            "transpose_x1", False).set_attr_bool("transpose_x2",
+                                                                 False)
+                y_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", out_grad).set_input(
+                        "x2", x).set_attr_bool(
+                            "transpose_x1", True).set_attr_bool("transpose_x2",
+                                                                False)
+            else:
+                x_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", out_grad).set_input(
+                        "x2", y).set_attr_bool(
+                            "transpose_x1", False).set_attr_bool("transpose_x2",
+                                                                 True)
+                y_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", x).set_input(
+                        "x2", out_grad).set_attr_bool(
+                            "transpose_x1", True).set_attr_bool("transpose_x2",
+                                                                False)
+
+        return [x_grad, y_grad], [[0], [1]]
+
+
+class MulGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(MulGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "mul_grad"
+
+    def _apply(self):
+        out_grad = self._get_ge_input(self.op.input_arg_names[0])
+        x = self._get_ge_input(self.op.input_arg_names[1])
+        y = self._get_ge_input(self.op.input_arg_names[2])
+        x_num_col_dims = self.op.attr("x_num_col_dims")
+        y_num_col_dims = self.op.attr("y_num_col_dims")
+
+        shape_out_grad = self.op.block.var(self.op.input_arg_names[0]).shape
+        shape_x = self.op.block.var(self.op.input_arg_names[1]).shape
+        shape_y = self.op.block.var(self.op.input_arg_names[2]).shape
+
+        if x_num_col_dims == 1 and y_num_col_dims == 1:
+            if len(shape_x) == 2 and len(shape_y) == 2:
+                x_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", out_grad).set_input(
+                        "x2", y).set_attr_bool(
+                            "transpose_x1", False).set_attr_bool("transpose_x2",
+                                                                 True)
+                y_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", x).set_input(
+                        "x2", out_grad).set_attr_bool(
+                            "transpose_x1", True).set_attr_bool("transpose_x2",
+                                                                False)
+            elif len(shape_x) == 3 and len(shape_y) == 2:
+                flatten_x = core.GEOperatorFactory.create_operator(
+                    "flatten" + self._accumulated_op_id(),
+                    "Flatten").set_input("x", x)
+                x_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input(
+                        "x1", out_grad).set_input("x2", y).set_attr_bool(
+                            "transpose_x1",
+                            False).set_attr_bool("transpose_x2", True)
+                if len(shape_out_grad) == 2:
+                    x_grad = core.GEOperatorFactory.create_operator(
+                        "unsqueeze" + self._accumulated_op_id(),
+                        "Unsqueeze").set_input("x", x_grad).set_attr_vec_int32(
+                            "axes", [1])
+
+                y_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input(
+                        "x1",
+                        flatten_x).set_input("x2", out_grad).set_attr_bool(
+                            "transpose_x1",
+                            True).set_attr_bool("transpose_x2", False)
+        else:
+            if len(shape_x) == 3 and len(shape_y) == 2:
+                assert x_num_col_dims == 2, "only x_num_col_dims == 2 is supported"
+                flatten_x = core.GEOperatorFactory.create_operator(
+                    "flatten" + self._accumulated_op_id(),
+                    "FlattenV2").set_input("x", x).set_attr_int32(
+                        "axis", 0).set_attr_int32("end_axis", 1)
+                flatten_out_grad = core.GEOperatorFactory.create_operator(
+                    "flatten" + self._accumulated_op_id(),
+                    "FlattenV2").set_input("x", out_grad).set_attr_int32(
+                        "axis", 0).set_attr_int32("end_axis", 1)
+
+                y_unsqueeze = core.GEOperatorFactory.create_operator(
+                    "unsqueeze" + self._accumulated_op_id(),
+                    "Unsqueeze").set_input("x",
+                                           y).set_attr_vec_int32("axes", [0])
+                x_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "BatchMatMul").set_input("x1", out_grad).set_input(
+                        "x2", y_unsqueeze).set_attr_bool(
+                            "adj_x1", False).set_attr_bool("adj_x2", True)
+                y_grad = core.GEOperatorFactory.create_operator(
+                    self.parser_name + self._accumulated_op_id(),
+                    "MatMul").set_input("x1", flatten_x).set_input(
+                        "x2", flatten_out_grad).set_attr_bool(
+                            "transpose_x1",
+                            True).set_attr_bool("transpose_x2", False)
+
+        return [x_grad, y_grad], [[0], [1]]
+
+
+class ReluGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(ReluGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "relu_grad"
+
+    def _apply(self):
+        out = self._get_ge_input(self.op.input_arg_names[0])
+        out_grad = self._get_ge_input(self.op.input_arg_names[1])
+        relu_grad = core.GEOperatorFactory.create_operator(
+            self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input(
+                "gradients", out_grad).set_input("features", out)
+        return [relu_grad], [[0]]
+
+
+class SoftmaxWithCrossEntropyGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "softmax_with_cross_entropy_grad"
+
+    def _apply(self):
+        label = self._get_ge_input(self.op.input_arg_names[0])
+        loss_grad = self._get_ge_input(self.op.input_arg_names[1])
+        softmax = self._get_ge_input(self.op.input_arg_names[2])
+        cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1]
+
+        label_shape = self.op.block.var(self.op.input_arg_names[0]).shape
+        loss_grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape
+        softmax_shape = self.op.block.var(self.op.input_arg_names[2]).shape
+
+        tensoron = self._create_ge_tensor([1], 5, 1)
+        on = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value", tensoron)
+        tensoroff = self._create_ge_tensor([1], 5, 0)
+        off = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value", tensoroff)
+        self._mark_as_input(on)
+        self._mark_as_input(off)
+
+        label = core.GEOperatorFactory.create_operator(
+            "cast" + self._accumulated_op_id(), "Cast").set_input(
+                "x", label).set_attr_int32("dst_type", 3)
+        onehot = core.GEOperatorFactory.create_operator(
+            "onehot" + self._accumulated_op_id(), "OneHotD").set_input(
+                "x", label).set_input("on_value", on).set_input(
+                    "off_value", off).set_attr_int32("depth", cls_num)
+        squeeze = core.GEOperatorFactory.create_operator(
+            "squeeze" + self._accumulated_op_id(),
+            "Squeeze").set_input("x", onehot)
+        sub = core.GEOperatorFactory.create_operator(
+            "sub" + self._accumulated_op_id(), "Sub").set_input(
+                "x1", softmax).set_input("x2", squeeze)
+        grad = core.GEOperatorFactory.create_operator(
+            "mul" + self._accumulated_op_id(),
+            "Mul").set_input("x1", loss_grad).set_input("x2", sub)
+
+        return [on, off, label, onehot, grad], [[-1]]
+
+
+class DotMulGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(DotMulGradParser, self).__init__(graph, var2geop)
+        self.parser_name = 
"elementwise_mul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + out_1 = self._get_ge_input(self.op.input_arg_names[1]) + out_2 = self._get_ge_input(self.op.input_arg_names[2]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", out_grad).set_input("x2", out_2) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", out_1).set_input("x2", out_grad) + + return [x_grad, y_grad], [[0], [1]] + + +class DotAddGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotAddGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_add_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + out_1 = self._get_ge_input(self.op.input_arg_names[1]) + out_2 = self._get_ge_input(self.op.input_arg_names[2]) + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + out_1_shape = self.op.block.var(self.op.input_arg_names[1]).shape + out_2_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + x_grad = out_grad + cur_time_x = len(out_grad_shape) - len(out_1_shape) + for i in range(cur_time_x): + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x_grad).set_attr_vec_int32( + "axes", [0]).set_attr_bool("keep_dims", False) + for axis, size in enumerate(out_1_shape): + if size == 1: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x_grad).set_attr_vec_int32( + "axes", [axis]).set_attr_bool("keep_dims", True) + + y_grad = out_grad + cur_time_y = len(out_grad_shape) - len(out_2_shape) + for i in range(cur_time_y): + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", y_grad).set_attr_vec_int32( + "axes", [0]).set_attr_bool("keep_dims", False) + for axis, size in enumerate(out_2_shape): + if size == 1: + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", y_grad).set_attr_vec_int32( + "axes", [axis]).set_attr_bool("keep_dims", True) + + return [x_grad, y_grad], [[0], [1]] + + +class DotDivGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotDivGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_div_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + x = self._get_ge_input(self.op.input_arg_names[2]) + y = self._get_ge_input(self.op.input_arg_names[3]) + + y_power = core.GEOperatorFactory.create_operator( + "power" + self._accumulated_op_id(), "Power").set_input( + "x", y).set_attr_float("power", -1) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", x) + x_zero = core.GEOperatorFactory.create_operator( + "equal" + self._accumulated_op_id(), "Equal").set_input( + "x1", x).set_input("x2", tensor_zeros) + x_nozero = core.GEOperatorFactory.create_operator( + "logical_not" + self._accumulated_op_id(), + "LogicalNot").set_input("x", x_zero) + x_nozero_f = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_nozero).set_attr_int32("dst_type", 0) + x_grad_w = 
core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", x_nozero_f).set_input("x2", y_power) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", x_grad_w).set_input("x2", out_grad) + + y_grad_w = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", out).set_input("x2", y_power) + y_grad = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", y_grad_w).set_input("x2", out_grad) + + return [x_grad, y_grad], [[0], [1]] + + +class SoftmaxGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SoftmaxGradParser, self).__init__(graph, var2geop) + self.parser_name = "softmax_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "SoftmaxGrad").set_input("softmax", out).set_input("grad_softmax", + out_grad) + return [x_grad], [[0]] + + +class ReshapeGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReshapeGradParser, self).__init__(graph, var2geop) + self.parser_name = "reshape2_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x_shape = self._get_ge_input(self.op.input_arg_names[1]) + x_shape_list = self.op.block.var(self.op.input_arg_names[1]).shape + + if x_shape_list[0] == 0: + x_shape_delzero = x_shape_list[1:] + tensor = self._create_ge_tensor([len(x_shape_delzero)], 2, + x_shape_delzero) + const_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + x_grad = core.GEOperatorFactory.create_operator( + "reshape" + self._accumulated_op_id(), "Reshape").set_input( + "x", out_grad).set_input("shape", const_shape) + + return [x_grad], [[0]] - return [reshape, reshape], [[0], [1]] + +class GatherGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(GatherGradParser, self).__init__(graph, var2geop) + self.parser_name = "gather_grad" + + def _apply(self): + index = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + x = self._get_ge_input(self.op.input_arg_names[2]) + + index_shape = self.op.block.var(self.op.input_arg_names[0]).shape + out_grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + x_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + if len(index_shape) == 1: + index = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), "Unsqueeze").set_input( + "x", index).set_attr_vec_int32("axes", [1]) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", x) + x_grad = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterUpdate").set_input("x", tensor_zeros).set_input( + "indices", index).set_input("updates", out_grad) + + return [tensor_zeros, x_grad], [[-1]] + + +class TransposeGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TransposeGradParser, self).__init__(graph, var2geop) + self.parser_name = "transpose2_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + perm 
= self.op.attr("axis") + + x_shape = self.op.block.var(self.op.input_arg_names[1]).shape[1:] + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + assert list(map(lambda x: out_grad_shape[x], perm)) == list(x_shape) + + x_grad = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), "TransposeD").set_input( + "x", out_grad).set_attr_vec_int32("perm", perm) + + return [x_grad], [[0]] + + +class LayerNormGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LayerNormGradParser, self).__init__(graph, var2geop) + self.parser_name = "layer_norm_grad" + + def _apply(self): + bias = self._get_ge_input(self.op.input_arg_names[0]) + mean = self._get_ge_input(self.op.input_arg_names[1]) + scale = self._get_ge_input(self.op.input_arg_names[2]) + variance = self._get_ge_input(self.op.input_arg_names[3]) + x = self._get_ge_input(self.op.input_arg_names[4]) + out_grad = self._get_ge_input(self.op.input_arg_names[5]) + x_dtype = self.op.block.var(self.op.input_arg_names[4]).dtype + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "LayerNormGrad").set_input("dy", out_grad).set_input( + "x", x).set_input("variance", variance).set_input( + "mean", mean).set_input("gamma", scale) + + cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str( + x_dtype)] == 0 else 1 + out_x_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 0).set_attr_int32("dst_type", cast_dtype) + out_scale_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 1).set_attr_int32("dst_type", cast_dtype) + out_bias_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 2).set_attr_int32("dst_type", cast_dtype) + + return [out_x_grad, out_scale_grad, out_bias_grad], [[2], [1], [0]] + + +class TanhGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TanhGradParser, self).__init__(graph, var2geop) + self.parser_name = 'tanh_grad' + + def _apply(self): + y = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + tanh_grad = core.GEOperatorFactory.create_operator( + "tanh_grad" + self._accumulated_op_id(), + "TanhGrad").set_input("y", y).set_input("dy", out_grad) + + return [tanh_grad], [[0]] + + +class LogGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogGradParser, self).__init__(graph, var2geop) + self.parser_name = 'log_grad' + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + log_grad = core.GEOperatorFactory.create_operator( + "log_grad" + self._accumulated_op_id(), + "DivNoNan").set_input("x1", grad).set_input("x2", input) + return [log_grad], [[0]] + + +class SqrtGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqrtGradParser, self).__init__(graph, var2geop) + self.parser_name = "sqrt_grad" + + def _apply(self): + y = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + sqrt_grad = core.GEOperatorFactory.create_operator( + "sqrt_grad" + self._accumulated_op_id(), + "SqrtGrad").set_input("y", y).set_input("dy", out_grad) + return [sqrt_grad], [[0]] + + +class PowGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(PowGradParser,
self).__init__(graph, var2geop) + self.parser_name = "pow_grad" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + factor = self.op.attr("factor") + + shape_tensor = self._create_shape_tensor() + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + factor_scale = self._create_ge_tensor([1], 5, factor) + factor_scale = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", factor_scale) + factor_tensor = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input( + "x", factor_scale).set_input("shape", shape_tensor) + + x_power = core.GEOperatorFactory.create_operator( + "x_power" + self._accumulated_op_id(), "Power").set_input( + "x", x).set_attr_float("power", factor - 1) + x_power_mul_factor = core.GEOperatorFactory.create_operator( + "x_power_mul_factor" + self._accumulated_op_id(), "Mul").set_input( + "x1", x).set_input("x2", factor_tensor) + x_power_mul_factor_grad = core.GEOperatorFactory.create_operator( + "x_power_mul_factor_grad" + self._accumulated_op_id(), + "Mul").set_input("x1", x_power_mul_factor).set_input("x2", grad) + + return [x_power_mul_factor_grad], [[0]] + + +class GeluGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(GeluGradParser, self).__init__(graph, var2geop) + self.parser_name = "gelu_grad" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + + y = core.GEOperatorFactory.create_operator( + "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x) + gelu_grad = core.GEOperatorFactory.create_operator( + "gelu_grad" + self._accumulated_op_id(), "GeluGrad").set_input( + "x", x).set_input("dy", grad).set_input("y", y) + + return [gelu_grad], [[0]] + + +class MeanGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MeanGradParser, self).__init__(graph, var2geop) + self.parser_name = "mean_grad" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + + ones_tensor = core.GEOperatorFactory.create_operator( + "one_tensor" + self._accumulated_op_id(), + "OnesLike").set_input("x", x) + sum = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", ones_tensor).set_attr_bool( + "keep_dims", False).set_attr_vec_int32("axes", []) + mean = core.GEOperatorFactory.create_operator( + "x_power" + self._accumulated_op_id(), "Power").set_input( + "x", sum).set_attr_float("power", -1) + + mean_grad = core.GEOperatorFactory.create_operator( + "mean_grad" + self._accumulated_op_id(), + "Mul").set_input("x1", mean).set_input("x2", grad) + + return [mean_grad], [[0]] + + +class SliceGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SliceGradParser, self).__init__(graph, var2geop) + self.parser_name = "slice_grad" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + grad = self._get_ge_input(self.op.input_arg_names[1]) + axes = self.op.attr("axes") + starts = self.op.attr("starts") + ends = self.op.attr("ends") + + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + + len_shape = len(x_shape) + axes_cor = list(range(len_shape)) + 
starts_cor, ends_cor = [], [] + cnt = 0 + for i in range(len_shape): + starts_cor.append(starts[cnt] if i in axes else 0) + if i in axes and ends[cnt] <= x_shape[i]: + ends_cor.append(x_shape[i] - ends[cnt]) + else: + ends_cor.append(0) + if i in axes: + cnt += 1 + + starts_cor[0] = 0 + ends_cor[0] = 0 + paddings = [[s, e] for (s, e) in zip(starts_cor, ends_cor)] + slice_value = core.GEOperatorFactory.create_operator( + "slice_grad" + self._accumulated_op_id(), "PadD").set_input( + "x", grad).set_attr_vec_vec_int64("paddings", paddings) + + return [slice_value], [[0]] + + +class LookUpTableGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LookUpTableGradParser, self).__init__(graph, var2geop) + self.parser_name = "lookup_table_grad" + + def _apply(self): + ids = self._get_ge_input(self.op.input_arg_names[0]) + grad = self._get_ge_input(self.op.input_arg_names[1]) + embedding = self._get_ge_input(self.op.input_arg_names[2]) + + shape_ids = self.op.block.var(self.op.input_arg_names[0]).shape + shape_grad = self.op.block.var(self.op.input_arg_names[1]).shape + shape_embedding = self.op.block.var(self.op.input_arg_names[2]).shape + + ids_flatten = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), "FlattenV2").set_input( + "x", + ids).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1) + grad_flatten = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), "FlattenV2").set_input( + "x", + grad).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", embedding) + embedding_grad = core.GEOperatorFactory.create_operator( + "scatteradd" + self._accumulated_op_id(), + "TensorScatterAdd").set_input( + "x", tensor_zeros).set_input("indices", ids_flatten).set_input( + "updates", grad_flatten) + + return [embedding_grad], [[0]] + + +class SGDParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SGDParser, self).__init__(graph, var2geop) + self.parser_name = "sgd" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + lr = self._get_ge_input(self.op.input_arg_names[1]) + param = self._get_ge_input(self.op.input_arg_names[2]) + sgd = core.GEOperatorFactory.create_operator( + "momentum" + self._accumulated_op_id(), + "ApplyGradientDescent").set_input("var", param).set_input( + "alpha", lr).set_input("delta", grad) + return [sgd], [[0]] + + +class AdamParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AdamParser, self).__init__(graph, var2geop) + self.parser_name = "adam" + + def _apply(self): + beta1_power = self._get_ge_input(self.op.input_arg_names[0]) + beta2_power = self._get_ge_input(self.op.input_arg_names[1]) + grad = self._get_ge_input(self.op.input_arg_names[2]) + lr = self._get_ge_input(self.op.input_arg_names[3]) + moment1 = self._get_ge_input(self.op.input_arg_names[4]) + moment2 = self._get_ge_input(self.op.input_arg_names[5]) + param = self._get_ge_input(self.op.input_arg_names[6]) + beta1 = self.op.attr('beta1') + beta2 = self.op.attr('beta2') + epsilon = self.op.attr('epsilon') + + beta1 = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, beta1)) + beta2 = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, beta2)) + 
epsilon = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, epsilon)) + + adam = core.GEOperatorFactory.create_operator( + "adam" + self._accumulated_op_id(), + "ApplyAdam").set_input("var", param).set_input( + "m", moment1).set_input("v", moment2).set_input( + "beta1_power", beta1_power).set_input( + "beta2_power", beta2_power).set_input( + "lr", lr).set_input("beta1", beta1).set_input( + "beta2", beta2).set_input( + "epsilon", epsilon).set_input("grad", grad) + + return [adam], [[0]] diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index c3d27bcc4ea551..a7f938647ad719 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -106,6 +106,11 @@ def _add_sync_by_allreduce(block): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) + block.append_op( + type='c_sync_calc_stream', + inputs={'X': sync_var}, + outputs={'Out': sync_var}, + attrs={OP_ROLE_KEY: OpRole.Forward}) block = program.global_block() if core.is_compiled_with_cuda(): diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 159c0b973b2b72..9a4ffd2fd02d4a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -61,8 +61,9 @@ def _setup_nccl_op(self, startup_program, main_program, build_strategy): trainer_endpoints_env = ",".join(trainer_endpoints) trainers_num = self.role_maker._worker_num() - if trainer_id == 0: - wait_server_ready(other_trainers) + # FIXME(wangxi): approve this. 
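For reference, ApplyAdam, which the AdamParser above targets, computes the standard Adam update with the bias correction folded into the learning rate. A minimal pure-Python sketch of that rule (illustrative names only, not part of this patch):

def apply_adam(var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad):
    # first and second moment estimates
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad * grad
    # bias-corrected step size
    lr_t = lr * (1.0 - beta2_power) ** 0.5 / (1.0 - beta1_power)
    # parameter update
    var = var - lr_t * m / (v ** 0.5 + epsilon)
    return var, m, v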
+ #if trainer_id == 0: + # wait_server_ready(other_trainers) if core.is_compiled_with_cuda(): comm_id_var = startup_program.global_block().create_var( diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py old mode 100644 new mode 100755 index 6f435bb86ba5ac..ae2daa9b9d8592 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -45,11 +45,16 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, 'accumulate_steps'] self.schedule_mode = user_defined_strategy.pipeline_configs[ 'schedule_mode'] + self.use_sharding = user_defined_strategy.sharding def _can_apply(self): if not self.role_maker._is_collective: return False + # FIXME revise for hybrid parallelism + if self.use_sharding: + return False + if self.user_defined_strategy.pipeline == True: return True return False @@ -171,6 +176,7 @@ def minimize_impl(self, program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id program._pipeline_opt['micro_batch_size'] = self.micro_batch_size program._pipeline_opt['schedule_mode'] = self.schedule_mode + program._pipeline_opt['use_sharding'] = False optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize( loss, startup_program, parameter_list, no_grad_set) self.startup_program = orig_startup_program._pipeline_opt[ @@ -218,7 +224,6 @@ def _insert_allreduce_ops(self, ring_id): grad = None processed_param_name = set() first_optimize_op_idx = None - add_sync_calc_stream = False for idx, op in reversed(list(enumerate(block.ops))): if is_backward_op(op) and not first_optimize_op_idx: first_optimize_op_idx = idx + 1 @@ -242,15 +247,6 @@ def _insert_allreduce_ops(self, ring_id): origin_param = origin_block.vars[op_role_var[i]] if origin_param.is_distributed: continue - if not add_sync_calc_stream: - add_sync_calc_stream = True - block._insert_op( - first_optimize_op_idx + offset, - type='c_sync_calc_stream', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={OP_ROLE_KEY: OpRole.Optimize}) - offset += 1 block._insert_op( first_optimize_op_idx + offset, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py old mode 100644 new mode 100755 index 03b36262a4fb1e..40ba77815663f0 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -73,7 +73,7 @@ def remove_cast_op(block, params, segment, offset): @staticmethod def prune_fp16(block, shard, reduced_grads_to_param, ring_id): """ - 1. prune all cast_fp32_to_fp16 ops if the param not belongs to this shard + 1. prune all cast_fp16_to_fp32 ops if the param not belongs to this shard 2. 
revise amp inifine grad checking for sharding """ # remove cast @@ -81,7 +81,10 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_id): if not FP16Utils.is_fp32_cast_op(block, op): continue output_name = op.desc.output_arg_names()[0] - param_name = output_name.strip("@GRAD") + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + param_name = output_name.strip( + "@GRAD@MERGED" + ) if "@MERGED" in output_name else output_name.strip("@GRAD") if param_name not in shard.global_params: raise ValueError("Output 'X' of cast_op must be a grad of" "model param, but {} is not a grad".format( @@ -103,20 +106,37 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_id): op._rename_input(inf_var_name, inf_var_name + "@sharding") if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: reversed_x = [] + reversed_x_paramname = [] for input_name in op.desc.input('X'): - param_name = input_name.strip("@GRAD") + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + if "@MERGED" in input_name: + param_name = input_name.strip("@GRAD@MERGED") + else: + param_name = input_name.strip("@GRAD") if param_name not in shard.global_params: raise ValueError( "Input 'X' of check_finite_and_unscale must" "be grads, but {} is not a grad".format(input_name)) if shard.has_param(param_name): reversed_x.append(input_name) + reversed_x_paramname.append(param_name) op.desc.set_input('X', reversed_x) op.desc.set_output('Out', reversed_x) + + # the grad checking should cover all and only the params in the current shard + to_check_param = set(reversed_x_paramname) + should_check_param = set(shard.global_params).intersection( + set([param for param, worker_idx in shard.global_param2device.items() \ + if worker_idx == shard.worker_idx])) + assert to_check_param == should_check_param, "amp \ + check_finite_and_unscale checking missed [{}] and got unexpected [{}]".format( + should_check_param - to_check_param, + to_check_param - should_check_param) + if update_loss_scaling_op_idx == -1: return inf_var = block.var(inf_var_name) - inf_var_fp32 = block.create_var( + inf_var_int32 = block.create_var( name=inf_var_name + "@cast_int32", shape=inf_var.shape, dtype=core.VarDesc.VarType.INT32) @@ -128,33 +148,86 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_id): update_loss_scaling_op_idx, type='cast', inputs={'X': inf_var}, - outputs={'Out': inf_var_fp32}, + outputs={'Out': inf_var_int32}, attrs={ "in_dtype": inf_var.dtype, - "out_dtype": inf_var_fp32.dtype, + "out_dtype": inf_var_int32.dtype, OP_ROLE_KEY: OpRole.Optimize }) - insert_sync_calc_op(block, update_loss_scaling_op_idx + 1, - [inf_var_fp32]) + # this allreduce communication should not overlap with calc block._insert_op_without_sync( - update_loss_scaling_op_idx + 2, + update_loss_scaling_op_idx + 1, type='c_allreduce_max', - inputs={'X': inf_var_fp32}, - outputs={'Out': inf_var_fp32}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Optimize}) - - comm_op_num = insert_sync_comm_op(block, update_loss_scaling_op_idx + 3, - ring_id, [inf_var_fp32]) - + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) block._insert_op_without_sync( - update_loss_scaling_op_idx + 3 + comm_op_num, + update_loss_scaling_op_idx + 2, type='cast', - inputs={'X': inf_var_fp32}, + inputs={'X': inf_var_int32}, outputs={'Out': inf_var_sharding}, attrs={ - "in_dtype": inf_var_fp32.dtype, + "in_dtype": inf_var_int32.dtype, "out_dtype":
inf_var_sharding.dtype, OP_ROLE_KEY: OpRole.Optimize }) block._sync_with_cpp() + + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + @staticmethod + def sync_amp_check_nan_inf(block, ring_id): + update_loss_scaling_op_idx = -1 + + for idx, op in reversed(list(enumerate(block.ops))): + if op.type == "update_loss_scaling": + update_loss_scaling_op_idx = idx + inf_var_name = op.desc.input('FoundInfinite')[0] + op._rename_input(inf_var_name, inf_var_name + "@GLOBAL_WORLD") + + # not use amp + if update_loss_scaling_op_idx == -1: + return + inf_var = block.var(inf_var_name) + inf_var_int32 = block.create_var( + name=inf_var_name + "@cast_int32", + shape=inf_var.shape, + dtype=core.VarDesc.VarType.INT32) + inf_var_global = block.create_var( + name=inf_var_name + "@GLOBAL_WORLD", + shape=inf_var.shape, + dtype=inf_var.dtype) + block._insert_op_without_sync( + update_loss_scaling_op_idx, + type='cast', + inputs={'X': inf_var}, + outputs={'Out': inf_var_int32}, + attrs={ + "in_dtype": inf_var.dtype, + "out_dtype": inf_var_int32.dtype, + OP_ROLE_KEY: OpRole.Optimize + }) + block._insert_op_without_sync( + update_loss_scaling_op_idx + 1, + type='c_allreduce_max', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) + block._insert_op_without_sync( + update_loss_scaling_op_idx + 2, + type='cast', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_global}, + attrs={ + "in_dtype": inf_var_int32.dtype, + "out_dtype": inf_var_global.dtype, + OP_ROLE_KEY: OpRole.Optimize + }) + block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py old mode 100644 new mode 100755 index c6aee792fcf745..d5a012b147a99e --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py @@ -16,14 +16,14 @@ class GradientClipHelper(object): - def __init__(self, sharding_ring_id): - self.sharding_ring_id = sharding_ring_id + def __init__(self, mp_ring_id): + self.mp_ring_id = mp_ring_id def _is_gradient_clip_op(self, op): return op.desc.has_attr("op_namescope") \ and op.desc.attr("op_namescope").startswith("/gradient_clip") - def prune_gradient_clip(self, block, shard): + def prune_gradient_clip(self, block, shard, pure_dp_degree=1): """ prune gradient_clip related ops for params that not belong to cur shard prune: square, reduce_sum, elementwise_mul @@ -31,6 +31,8 @@ def prune_gradient_clip(self, block, shard): """ deperated_vars = set() deperate_op_idx = set() + reversed_x_paramname = [] + global_norm_sum_op_idx = -1 for idx, op in enumerate(block.ops): if not self._is_gradient_clip_op(op): continue @@ -40,15 +42,22 @@ def prune_gradient_clip(self, block, shard): for input_name in op.desc.input_arg_names(): if input_name in deperated_vars: deperate_op = True - param_name = input_name.strip("@GRAD") + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + if "@MERGED" in input_name: + param_name = input_name.strip("@GRAD@MERGED") + else: + param_name = input_name.strip("@GRAD") if shard.is_param(param_name) and \ not shard.has_param(param_name): deperate_op = True + elif shard.is_param(param_name): + reversed_x_paramname.append(param_name) if deperate_op: deperate_op_idx.add(idx) for output_name in op.desc.output_arg_names(): - 
deperated_vars.add(output_name) + if output_name not in op.desc.input_arg_names(): + deperated_vars.add(output_name) if not deperated_vars: # got no gradient_clip op @@ -62,36 +71,96 @@ def prune_gradient_clip(self, block, shard): continue reversed_inputs = [] if op.type == "sum": + global_norm_sum_op_idx = idx for input_name in op.desc.input_arg_names(): if input_name not in deperated_vars: reversed_inputs.append(input_name) + op.desc.set_input("X", reversed_inputs) assert (len(op.desc.output_arg_names()) == 1) sum_res = op.desc.output_arg_names()[0] - block._insert_op_without_sync( - idx + 1, - type='c_sync_comm_stream', - inputs={'X': sum_res}, - outputs={'Out': sum_res}, - attrs={'ring_id': 0, - OP_ROLE_KEY: OpRole.Optimize}) + + # this allreduce should not overlap with calc and should be scheduled in calc stream block._insert_op_without_sync( idx + 1, type='c_allreduce_sum', inputs={'X': sum_res}, outputs={'Out': sum_res}, attrs={ - 'ring_id': self.sharding_ring_id, - OP_ROLE_KEY: OpRole.Optimize + 'ring_id': self.mp_ring_id, + 'op_namescope': "/gradient_clip_model_parallelism", + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, }) + + # the global norm should only be summed within each model parallelism world size when using the global group + if pure_dp_degree > 1: + block._insert_op_without_sync( + idx + 2, + type='scale', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={ + 'scale': 1.0 / float(pure_dp_degree), + 'op_namescope': "/gradient_clip_model_parallelism", + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) + + # the grad sum here should cover all and only the params in the current shard + to_check_param = set(reversed_x_paramname) + should_check_param = set(shard.global_params).intersection(set( + [param for param, worker_idx in shard.global_param2device.items() \ + if worker_idx == shard.worker_idx])) + assert to_check_param == should_check_param, "amp check_finite_and_unscale \ + checking missed [{}] and got unexpected [{}]".format( + should_check_param - to_check_param, + to_check_param - should_check_param) + + for var_name in deperated_vars: + block._remove_var(var_name, sync=False) + block._sync_with_cpp() + return + + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + def sync_global_norm(self, block, ring_id, pure_dp_degree=1): + """ + sync the global norm computed by gradient_clip among ranks by allreduce, + so that every rank clips with the same global norm; used for the + remaining parallelism when sharding is not applied + """ + for idx, op in reversed(list(enumerate(block.ops))): + if not self._is_gradient_clip_op(op): + continue + + if op.type == "sum": + sum_res = op.desc.output_arg_names()[0] block._insert_op_without_sync( idx + 1, - type='c_sync_calc_stream', + type='c_allreduce_sum', inputs={'X': sum_res}, outputs={'Out': sum_res}, - attrs={OP_ROLE_KEY: OpRole.Optimize}) + attrs={ + 'ring_id': ring_id, + 'op_namescope': "/gradient_clip_model_parallelism", + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, + }) + + # the global norm should only be summed within each model parallelism world size + if pure_dp_degree > 1: + block._insert_op_without_sync( + idx + 2, + type='scale', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={ + 'scale': 1.0 / float(pure_dp_degree), + 'op_namescope': "/gradient_clip_model_parallelism", + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) - for var_name in deperated_vars: - block._remove_var(var_name, sync=False) - block._sync_with_cpp() return diff --git
a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py new file mode 100755 index 00000000000000..76803818453c92 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -0,0 +1,281 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole +from paddle.fluid import core, unique_name + + +class OffloadHelper(object): + cpu_place_type = 0 + cuda_place_type = 1 + cuda_pinned_place_type = 2 + + def __init__(self): + # dst_place_type for memcpy: + # 0: dst is on CPUPlace. + # 1: dst is on CUDAPlace. + # 2: dst is on CUDAPinnedPlace. + pass + + def _insert_cast_op(self, block, idx, src_name, dst_name): + src_var = block.var(src_name) + if not block.has_var(dst_name): + block.create_var( + name=dst_name, + shape=src_var.shape, + dtype=core.VarDesc.VarType.FP16, + persistable=True) + dst_var = block.var(dst_name) + assert dst_var.dtype == core.VarDesc.VarType.FP16 + block._insert_op_without_sync( + idx, + type='cast', + inputs={'X': src_var}, + outputs={'Out': dst_var}, + attrs={ + 'in_dtype': src_var.dtype, + 'out_dtype': dst_var.dtype, + OP_ROLE_KEY: OpRole.Optimize + }) + + def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type): + src_var = block.var(src_name) + dst_var = block.var(dst_name) + block._insert_op_without_sync( + idx, + type='memcpy', + inputs={'X': src_var}, + outputs={'Out': dst_var}, + attrs={ + 'dst_place_type': dst_place_type, + OP_ROLE_KEY: OpRole.Optimize, + }) + + def _insert_fetch_op(self, block, idx, src_name, dst_name): + self._insert_memcpy_op(block, idx, src_name, dst_name, + OffloadHelper.cuda_place_type) + + def _insert_offload_op(self, block, idx, src_name, dst_name): + self._insert_memcpy_op(block, idx, src_name, dst_name, + OffloadHelper.cuda_pinned_place_type) + + def _get_offload_var_name(self, name): + return unique_name.generate(name + '@offload') + + def _create_offload_var(self, var_name, offload_var_name, blocks): + for block in blocks: + var = block.var(var_name) + var.persistable = False + offload_var = block.create_var( + name=offload_var_name, + shape=var.shape, + dtype=var.dtype, + persistable=True) + + def offload_fp32param(self, block, startup_block): + """ + (p_fp16) = cast(p) + (p_fp16_recompute) = cast(p) + (pout,) = adam(p) + ===========================> + rename(p_fp16_recompute, p_fp16) + + (p,) = prefetch(p@offload) + (pout,) = adam(p) + (p_fp16) = cast(p) + (p@offload) = memcpy(p) + """ + param_to_idx = dict() + param_to_fp16 = dict() + # recompute vars which need to be renamed to fp16_param + fp16_param_to_recompute = dict() + recompute_to_fp16 = dict() + + def remove_param(input_name): + param_to_idx.pop(input_name) + if input_name in param_to_fp16: + fp16_param = param_to_fp16.pop(input_name) + if fp16_param in fp16_param_to_recompute: + recompute = fp16_param_to_recompute.pop(fp16_param) +
recompute_to_fp16.pop(recompute) + + # step1: record param + for idx, op in reversed(list(enumerate(block.ops))): + if op.type in ('adam', 'momentum', 'lars', 'lamb'): + param = op.desc.input("Param")[0] + param_to_idx[param] = idx + + # step2: remove param which can't offload + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + break + for input_name in op.desc.input_arg_names(): + if input_name not in param_to_idx: + continue + + # param is really used by a fp32 op + if op.type != 'cast': + remove_param(input_name) + continue + + # param is only used by the cast op, + # which casts fp32_param to fp16_param + output_name = op.output_arg_names[0] + if 'cast_fp16' not in output_name: + remove_param(input_name) + continue + + if 'subprog' not in output_name: + assert output_name == input_name + '.cast_fp16' + assert input_name not in param_to_fp16, \ + "There must be only one cast op from fp32 param to fp16 param." + param_to_fp16[input_name] = output_name + else: + # fp16-->recompute_var + assert input_name in param_to_fp16, \ + "param must first be cast to fp16" + fp16_param = param_to_fp16[input_name] + fp16_param_to_recompute[fp16_param] = output_name + recompute_to_fp16[output_name] = fp16_param + + param_name_to_offload_name = dict() + # step3: main_block add offload, cast op + # change recompute to fp16, remove cast(param) to fp16 + for idx, op in reversed(list(enumerate(block.ops))): + if op.type in ('adam', 'momentum', 'lars', 'lamb'): + param = op.desc.input("Param")[0] + if param not in param_to_idx: continue + # step3.1: create offload_var + offload_var_name = self._get_offload_var_name(param) + param_name_to_offload_name[param] = offload_var_name + self._create_offload_var(param, offload_var_name, + [block, startup_block]) + + # step3.2: insert cast op and offload op + self._insert_offload_op(block, idx + 1, param, offload_var_name) + + assert param in param_to_fp16 + fp16_param_name = param_to_fp16[param] + fp16_param_var = block.var(fp16_param_name) + fp16_param_var.persistable = True + self._insert_cast_op(block, idx + 1, param, + param_to_fp16[param]) + + # step3.3: insert fetch op + self._insert_fetch_op(block, idx, offload_var_name, param) + continue + + # step3.4: remove cast op + if op.type == 'cast': + input_name = op.desc.input_arg_names()[0] + if input_name in param_to_idx: + block._remove_op(idx, sync=False) + continue + + # step3.5: change recompute_param to fp16_param + for input_name in op.desc.input_arg_names(): + if input_name in recompute_to_fp16: + op._rename_input(input_name, recompute_to_fp16[input_name]) + for output_name in op.desc.output_arg_names(): + if output_name in recompute_to_fp16: + op._rename_output(output_name, + recompute_to_fp16[output_name]) + + # step4: remove recompute_param + for name in recompute_to_fp16.keys(): + block._remove_var(name, sync=False) + + # step5: startup_block add offload + visited_vars = set() + for idx, op in reversed(list(enumerate(startup_block.ops))): + for out_name in op.output_arg_names: + if out_name in visited_vars: + continue + + if out_name in param_name_to_offload_name: + var_name = out_name + offload_var_name = param_name_to_offload_name[var_name] + self._insert_offload_op(startup_block, idx + 1, var_name, + offload_var_name) + self._insert_cast_op(startup_block, idx + 1, var_name, + param_to_fp16[var_name]) + + visited_vars.add(out_name) + + block._sync_with_cpp() + startup_block._sync_with_cpp() + + def offload(self, block, startup_block): + """ + (m1, m2) = prefetch(m1@offload, m2@offload) + (m1out,
m2out, pout) = adam(m1, m2, p) + (m1@offload, m2@offload) = memcpy(m1, m2) + """ + vars_name_to_offload_name = dict() + + # main_block add offload + for idx, op in reversed(list(enumerate(block.ops))): + if not is_optimizer_op(op): + break + + vars_name = [] + if op.type == "adam": + # {Moment1Out = [''], Moment2Out = [''], ParamOut = ['']} = + # adam(inputs={Moment1 = [''], Moment2 = [''], Param = ['']}) + vars_name.append(op.desc.input("Moment1")[0]) + vars_name.append(op.desc.input("Moment2")[0]) + elif op.type == 'momentum': + pass + elif op.type == 'lars': + pass + elif op.type == 'lamb': + pass + + # step1: create and init offload_var + for var_name in vars_name: + assert var_name not in vars_name_to_offload_name + + offload_var_name = self._get_offload_var_name(var_name) + vars_name_to_offload_name[var_name] = offload_var_name + + self._create_offload_var(var_name, offload_var_name, + [block, startup_block]) + + # step2: insert offload op + for var_name in vars_name: + offload_var_name = vars_name_to_offload_name[var_name] + self._insert_offload_op(block, idx + 1, var_name, + offload_var_name) + + # step3: insert fetch op + for var_name in vars_name: + offload_var_name = vars_name_to_offload_name[var_name] + self._insert_fetch_op(block, idx, offload_var_name, var_name) + + # startup_block add offload + visited_vars = set() + for idx, op in reversed(list(enumerate(startup_block.ops))): + for out_name in op.output_arg_names: + if out_name in visited_vars: + continue + + if out_name in vars_name_to_offload_name: + var_name = out_name + offload_var_name = vars_name_to_offload_name[var_name] + # insert offload op after var is generated + self._insert_offload_op(startup_block, idx + 1, var_name, + offload_var_name) + visited_vars.add(out_name) + + block._sync_with_cpp() + startup_block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py old mode 100644 new mode 100755 index 70753b59ccc318..5a43367cf1ad12 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py @@ -126,6 +126,10 @@ def remove_op(self, op_idx): def should_remove_op(self, op_idx): op = self._block.ops[op_idx] + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + # remove check_finite_and_unscale op if its input 'X' is empty + if op.type == 'check_finite_and_unscale' and len(op.input('X')) == 0: + return True for output_name in op.desc.output_arg_names(): if output_name not in self._should_removed_var: return False diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index ad1cd4f60826bb..f4ceb2d287a56c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -28,21 +28,24 @@ def check_broadcast(block): if the broadcasted var has a fill_constant op, the fill_constant op should stay forward before the broadcast op, and before a sync_calc op. Otherwise, raise error. + + should ignore and skip broadcast_op of inner_parallelism (e.g. Megatron) """ broadcast_vars = {} for idx, op in enumerate(block.ops): if op.type == "c_broadcast": - var_name = op.desc.input_arg_names()[0] - if "@BroadCast" in var_name: - if var_name in broadcast_vars: - raise ValueError("var_name areadly exist: {}" - "the old pos is {}, the new pos is {}". 
- format(var_name, broadcast_vars[var_name][ - "broadcast_pos"], idx)) - broadcast_vars[var_name] = { - "fill_constant_pos": -1, - "broadcast_pos": idx, - } + if op.all_attrs()["use_calc_stream"] == False: + var_name = op.desc.input_arg_names()[0] + if "@BroadCast" in var_name: + if var_name in broadcast_vars: + raise ValueError("var_name already exists: {}" + "the old pos is {}, the new pos is {}". + format(var_name, broadcast_vars[ + var_name]["broadcast_pos"], idx)) + broadcast_vars[var_name] = { + "fill_constant_pos": -1, + "broadcast_pos": idx, + } for idx, op in enumerate(block.ops): if op.type == "fill_constant": @@ -61,14 +64,15 @@ def check_broadcast(block): last_sync_calc_op_idx = idx continue if op.type == "c_broadcast": - var_name = op.desc.input_arg_names()[0] - if "@BroadCast" in var_name: - if broadcast_vars[var_name]["fill_constant_pos"] != -1: - assert (last_sync_calc_op_idx != -1) - assert (broadcast_vars[var_name]["fill_constant_pos"] < - last_sync_calc_op_idx) - assert (last_sync_calc_op_idx < idx) - continue + if op.all_attrs()["use_calc_stream"] == False: + var_name = op.desc.input_arg_names()[0] + if "@BroadCast" in var_name: + if broadcast_vars[var_name]["fill_constant_pos"] != -1: + assert (last_sync_calc_op_idx != -1) + assert (broadcast_vars[var_name]["fill_constant_pos"] < + last_sync_calc_op_idx) + assert (last_sync_calc_op_idx < idx) + continue for input_name in op.desc.input_arg_names(): if input_name in broadcast_vars: assert (broadcast_vars[input_name]["broadcast_pos"] != -1) @@ -78,43 +82,48 @@ def check_broadcast(block): return -def check_allreduce_sum(block, shard, dp_ring_id=-1): +def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1): """ the op order should be: grad: - 0: op that generate Var - 1: sync_calc - - 2: allreduce_sum_sharding + - 2: reduce_sum_sharding (allreduce --> reduce) - 3: sync_comm - 4: allreuce_sum_dp (dp_grads) - 5: sync_comm (dp_grads) - 6: op that use Var (dp_grads & sum) + + should ignore and skip allreduce_op of inner_parallelism (e.g.
Megatron) """ vars_status = {} dp_grads_status = {} idx_last_grad_allreduce = -1 idx_amp_allreduce = -1 idx_gradient_clip_allreduce = -1 + for idx, op in enumerate(block.ops): - if op.type == "c_allreduce_sum": - ring_id = op.desc.attr("ring_id") - var_name = op.desc.input_arg_names()[0] - param = var_name.split("@")[0] + # sharding use both allreduce and reduce to sync grad + if op.type == "c_allreduce_sum" or op.type == "c_reduce_sum": + if op.all_attrs()["use_calc_stream"] == False: + ring_id = op.desc.attr("ring_id") + var_name = op.desc.input_arg_names()[0] + param = var_name.split("@")[0] - assert 'sum' in var_name or ("@GRAD" in var_name) - if 'sum' in var_name or (not shard.has_param(param)): - vars_status[var_name] = -1 - else: - dp_grads_status[var_name] = -1 + assert 'sum' in var_name or ("@GRAD" in var_name) + if 'sum' in var_name or (not shard.has_param(param)): + vars_status[var_name] = -1 + else: + dp_grads_status[var_name] = -1 - if ring_id != 0: - assert shard.has_param(param) - assert ring_id == dp_ring_id + if ring_id != sharding_ring_id: + assert shard.has_param(param) + assert ring_id == dp_ring_id - if "sum" in var_name: - idx_amp_allreduce = idx - elif "@GRAD": - idx_last_grad_allreduce = idx + if "sum" in var_name: + idx_amp_allreduce = idx + elif "@GRAD": + idx_last_grad_allreduce = idx if op.type == "c_allreduce_max": idx_gradient_clip_allreduce = idx @@ -128,38 +137,41 @@ def check_allreduce_sum(block, shard, dp_ring_id=-1): if var_name in dp_grads_status and dp_grads_status[ var_name] == 0: dp_grads_status[var_name] = 1 - - elif op.type == "c_allreduce_sum": - var_name = op.desc.input_arg_names()[0] - ring_id = op.desc.attr("ring_id") - if ring_id == 0: - if var_name in vars_status: - _status = vars_status[var_name] - else: - _status = dp_grads_status[var_name] - if _status == -1: - raise ValueError("{} is not generated, but you are" - "trying to all-reduce it".format(var_name)) - if _status == 0: - raise ValueError("There should be a sync_calc op " - "after generate Var: {} and before the" - "c_allreduce_sum op".format(var_name)) - assert (_status == 1) - if var_name in vars_status: - vars_status[var_name] = 2 + # check sharding allreduce and reduce but skip megatron allreduce + elif op.type == "c_allreduce_sum" or op.type == "c_reduce_sum": + if op.all_attrs()["use_calc_stream"] == False: + var_name = op.desc.input_arg_names()[0] + ring_id = op.desc.attr("ring_id") + if ring_id == sharding_ring_id: + assert op.type == "c_reduce_sum", "Grad in Sharding group should be reduce rather than allreduce" + if var_name in vars_status: + _status = vars_status[var_name] + else: + _status = dp_grads_status[var_name] + if _status == -1: + raise ValueError("{} is not generated, but you are" + "trying to all-reduce it".format( + var_name)) + if _status == 0: + raise ValueError("There should be a sync_calc op " + "after generate Var: {} and before the" + "c_allreduce_sum op".format(var_name)) + assert (_status == 1) + if var_name in vars_status: + vars_status[var_name] = 2 + else: + dp_grads_status[var_name] = 2 else: - dp_grads_status[var_name] = 2 - else: - assert ring_id == dp_ring_id - param = var_name.split("@")[0] - assert shard.has_param(param) - assert dp_grads_status[var_name] == 3 - dp_grads_status[var_name] = 4 + assert ring_id == dp_ring_id + param = var_name.split("@")[0] + assert shard.has_param(param) + assert dp_grads_status[var_name] == 3 + dp_grads_status[var_name] = 4 elif op.type == "c_sync_comm_stream": var_name = op.desc.input_arg_names()[0] ring_id = 
op.desc.attr("ring_id") - if ring_id == 0: + if ring_id == sharding_ring_id: for var_name in op.desc.input_arg_names(): if var_name in vars_status: assert vars_status[var_name] == 2 @@ -181,6 +193,9 @@ def check_allreduce_sum(block, shard, dp_ring_id=-1): raise ValueError("There should be a sync_comm op " "after allreduce the Var: {}".format( input_name)) + raise ValueError( + "The reduce output grad [{}] should NOT be be used in Non-root rank.". + format(input_name)) if input_name in dp_grads_status: if dp_ring_id == -1: if dp_grads_status[input_name] != 3: @@ -259,6 +274,10 @@ def insert_sync_comm_ops(block, insert_idx, ring_id, comm_dep_vars): """ insert sync_comm_op for vars """ + # NOTE (JZ-LIANG) to be check, may result undefined case + if len(comm_dep_vars) == 0: + return 0 + op_role = get_valid_op_role(block, insert_idx) block._insert_op_without_sync( insert_idx, @@ -309,22 +328,89 @@ def insert_cast_ops(block, insert_idx, cast_ops): return -def insert_allreduce_ops(block, insert_idx, ring_id, allreduce_vars): +def insert_allreduce_ops(block, + insert_idx, + ring_id, + allreduce_vars, + op_role=OpRole.Backward, + use_calc_stream=False): """ _add_allreduce_ops """ + if len(allreduce_vars) == 0: + return + for var in allreduce_vars: block._insert_op_without_sync( insert_idx, type='c_allreduce_sum', inputs={'X': var}, outputs={'Out': var}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward}) + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + OP_ROLE_KEY: op_role + }) + + return + + +def insert_reduce_ops(block, + insert_idx, + ring_id, + reduce_vars, + shard, + op_role=OpRole.Backward, + use_calc_stream=False): + """ + _add_allreduce_ops + """ + for var in reduce_vars: + root_id = get_grad_device(var, shard) + assert root_id >= 0, "root id should be a positive int".format(var) + block._insert_op_without_sync( + insert_idx, + type='c_reduce_sum', + inputs={'X': var}, + outputs={'Out': var}, + attrs={ + 'ring_id': ring_id, + 'root_id': root_id, + 'use_calc_stream': use_calc_stream, + OP_ROLE_KEY: op_role + }) return +def get_grad_device(grad_name, shard): + assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( + grad_name) + base_name = None + # mind the traversal order + possible_suffixes = [ + '.cast_fp16@GRAD@MERGED', '.cast_fp16@GRAD', '@GRAD@MERGED', '@GRAD' + ] + for suffix in possible_suffixes: + if suffix in grad_name: + base_name = re.sub(suffix, '', grad_name) + break + + assert base_name in shard.global_param2device, "[{}] should be a param variable.".format( + base_name) + + return shard.global_param2device[base_name] + + +def get_first_check_finite_and_unscale_op_idx(block): + + for idx, op in enumerate(block.ops): + if op.type == "check_finite_and_unscale": + return idx + + raise ValueError("check_finite_and_unscale does not exist in block") + + def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): """ _add_broadcast_ops @@ -384,6 +470,7 @@ def insert_scale_loss_grad_ops(block, scale=1.0): outputs={'Out': loss_grad_var}, attrs={'scale': scale, OP_ROLE_KEY: OpRole.Backward}) + break def comm_analyse(main_program): @@ -428,7 +515,7 @@ def comm_analyse(main_program): count)) -def add_sync_comm(program, dist_strategy): +def add_sync_comm(program, sharding_ring_id): """ When clone a test prog by clone from the sharding main prog, part of the sync_comm op maybe be pruned by mistake, this function @@ -438,6 +525,7 @@ def add_sync_comm(program, dist_strategy): #NOTE (liangjianzhong): only support one comm stream by 
now, use more than one # comm streams will cause error. should be revise in future. + assert sharding_ring_id >= 0, "sharding_ring_id should be a non-negative integer" block = program.global_block() not_sync_vars = set([]) for op in block.ops: @@ -448,15 +536,14 @@ for input_name in op.desc.input_arg_names(): not_sync_vars.remove(input_name) if not_sync_vars: - for nccl_id in range(dist_strategy.nccl_comm_num): - block.append_op( - type='c_sync_comm_stream', - inputs={'X': list(not_sync_vars)}, - outputs={'Out': list(not_sync_vars)}, - attrs={ - 'ring_id': nccl_id, - 'op_role': core.op_proto_and_checker_maker.OpRole.Forward - }) + block.append_op( + type='c_sync_comm_stream', + inputs={'X': list(not_sync_vars)}, + outputs={'Out': list(not_sync_vars)}, + attrs={ + 'ring_id': sharding_ring_id, + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + }) return @@ -466,9 +553,12 @@ def save_persistables(exe, dirname, main_program, filename=None): and part of persistable vars are duplicated and exist in all the ranks with different values. This function handles the model saving for sharding training. """ + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + if main_program._pipeline_opt: + main_program = main_program._pipeline_opt['section_program']['program'] def is_opt_vars(var): - # NOTE(liangjianzhong): The checks should be updated when add new compatible optimizer + # NOTE(JZ-LIANG): The checks should be updated when adding a new compatible optimizer # now only Momentum and adam are compatible with sharding checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", @@ -479,12 +569,18 @@ def is_opt_vars(var): return True return False + def is_gradient_merge_vars(var): + # NOTE(JZ-LIANG): to revise save/load logic in framework instead of writing this naive rule + + return var.name.endswith("@GradiantMerge") + def is_trainable(var): return isinstance(var, paddle.fluid.framework.Parameter) and var.trainable def sharding_predicate(var): - return is_trainable(var) or is_opt_vars(var) + return is_trainable(var) or is_opt_vars(var) or is_gradient_merge_vars( + var) if int(os.environ.get('PADDLE_TRAINER_ID', 0)) == 0: paddle.fluid.io.save_persistables( @@ -498,3 +594,42 @@ def sharding_predicate(var): filename=None) return + + +def get_grad_device(grad_name, shard): + assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( + grad_name) + base_name = None + # mind the traversal order + possible_suffixes = ['.cast_fp16@GRAD', '@GRAD'] + for suffix in possible_suffixes: + if suffix in grad_name: + base_name = re.sub(suffix, '', grad_name) + break + + assert base_name in shard.global_param2device, "[{}] should be a param variable.".format( + base_name) + + return shard.global_param2device[base_name] + + +def append_naive_sync(block, sync_var, ring_id): + # NOTE (JZ-LIANG) update this to use barrier sync for more elegant logic + # sync within global + block.append_op( + type="fill_constant", + outputs={"Out": sync_var}, + attrs={ + "shape": sync_var.shape, + "dtype": sync_var.dtype, + "value": int(1), + }) + block.append_op( + type='c_allreduce_sum', + inputs={'X': sync_var}, + outputs={'Out': sync_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index a7f704361d31af..a83ae226a9df1e 100755 ---
a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -12,25 +12,34 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from paddle.fluid import unique_name, core import paddle.fluid as fluid - from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper -from paddle.distributed.fleet.meta_optimizers.common import is_backward_op +from paddle.distributed.fleet.meta_optimizers.common import is_backward_op, is_optimizer_op, is_update_op from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase from paddle.distributed.fleet.meta_optimizers.sharding.shard import Shard, ProgramSegment from paddle.distributed.fleet.meta_optimizers.sharding.fp16_helper import FP16Utils from paddle.distributed.fleet.meta_optimizers.sharding.weight_decay_helper import WeightDecayHelper from paddle.distributed.fleet.meta_optimizers.sharding.gradient_clip_helper import GradientClipHelper +from .sharding.offload_helper import OffloadHelper from paddle.distributed.fleet.meta_optimizers.sharding.prune import ProgramDeps from paddle.distributed.fleet.meta_optimizers.sharding.utils import * +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard +from paddle.fluid import layers + import logging +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') from functools import reduce __all__ = ["ShardingOptimizer"] class ShardingOptimizer(MetaOptimizerBase): + """Sharding Optimizer.""" + def __init__(self, optimizer): super(ShardingOptimizer, self).__init__(optimizer) self.inner_opt = optimizer @@ -39,6 +48,8 @@ def __init__(self, optimizer): "AMPOptimizer", "LarsOptimizer", "LambOptimizer", + # "ModelParallelOptimizer", + # "PipelineOptimizer", ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] self._main_program = None @@ -50,6 +61,10 @@ def __init__(self, optimizer): # reduced grads to param name self._reduced_grads_to_param = {} self._shard = Shard() + self._verbose = False + + # use sharding as outer parallelism (e.g. inner:Megatron & outer sharding) + self.mp_degree = 1 def _can_apply(self): if not self.role_maker._is_collective: @@ -64,7 +79,7 @@ def _disable_strategy(self, dist_strategy): def _enable_strategy(self, dist_strategy, context): dist_strategy.sharding = True - dist_strategy.sharding_configs = {"fuse_broadcast_MB": 32} + dist_strategy.sharding_configs = {"segment_broadcast_MB": 32} def minimize_impl(self, loss, @@ -75,104 +90,469 @@ def minimize_impl(self, # self._nrings = self.user_defined_strategy.nccl_comm_num self._nrings_sharding = 1 self._nrings_dp = 1 - self._fuse_broadcast_MB = self.user_defined_strategy.sharding_configs[ - "fuse_broadcast_MB"] + + # segment + self._sharding_segment_strategy = str( + self.user_defined_strategy.sharding_configs[ + "sharding_segment_strategy"]) + if self._sharding_segment_strategy == "segment_broadcast_MB": + self._broadcast_MB = self.user_defined_strategy.sharding_configs[ + "segment_broadcast_MB"] + assert self._broadcast_MB > 0, "segment size should be larger than zero!"
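A note on segment_broadcast_MB: segmentation by broadcast size cuts the ordered broadcast variables into segments once their accumulated size crosses the threshold. A minimal sketch of such a size-based policy (hypothetical helper, not part of this patch):

def segment_by_broadcast_mb(var_sizes_in_bytes, broadcast_MB):
    # start a new segment whenever the accumulated size reaches the limit
    limit = broadcast_MB * 1024 * 1024
    segments, current, accumulated = [], [], 0
    for size in var_sizes_in_bytes:
        current.append(size)
        accumulated += size
        if accumulated >= limit:
            segments.append(current)
            current, accumulated = [], 0
    if current:
        segments.append(current)
    return segments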
+ elif self._sharding_segment_strategy == "segment_anchors": + self._sharding_segment_anchors = self.user_defined_strategy.sharding_configs[ + "segment_anchors"] + assert len(self._sharding_segment_anchors + ) > 0, "you should set the sharding segment anchors!" + self._backward_remain_anchors = self._sharding_segment_anchors[:] + self._forward_remain_anchors = [] + else: + raise NotImplementedError( + "the sharding segment strategy [{}] is not implemented".format( + str(self._sharding_segment_strategy))) + + # parallelism + self.sharding_degree = int(self.user_defined_strategy.sharding_configs[ + "sharding_degree"]) + assert self.sharding_degree > 0, "sharding degree must be larger than zero" + self.mp_degree = int(self.user_defined_strategy.sharding_configs[ + "mp_degree"]) + # pipeline setting + # TODO (JZ-LIANG) should revise here to support mixed parallelism with pipeline + self.pp_degree = int(self.user_defined_strategy.sharding_configs[ + "pp_degree"]) + if self.pp_degree > 1: + assert self.user_defined_strategy.pipeline == True + + self.dp_degree = int(self.user_defined_strategy.sharding_configs[ + 'dp_degree']) + assert self.role_maker._worker_num( + ) == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "global world size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + self.role_maker._worker_num(), + self.mp_degree, + self.sharding_degree, + self.pp_degree, + self.dp_degree, ) + + self.hybrid_dp = self.user_defined_strategy.sharding_configs[ "hybrid_dp"] + # NOTE (JZ-LIANG) + # there are 2 kinds of modes for gradient-merge and hybrid-dp in mixed parallelism [sharding] and [pipeline]. + # we distinguish these two modes since the gm/hybrid-dp related allreduce should be inserted in different places according to the mode to get the best performance: + # sharding: communication within a node, and therefore should be inserted within the backward segment to overlap with bw calc, conducted every micro step + # pipeline: communication across nodes, and therefore should be inserted in the update segment, conducted just once per global step + self.hybrid_dp_mode = None + # dp here is the pure dp as the outermost parallelism + if self.hybrid_dp: + assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format( + self.dp_degree) + if self.pp_degree > 1: + self.hybrid_dp_mode = "pp_hybrid_dp" + else: + assert self.sharding_degree > 1, "by now we only support five kinds of hybrid dp: sharding_hybrid_dp, mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp."
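The world-size assertion above pins the rank layout to mp * sharding * pp * dp workers. Assuming mp is the innermost dimension and dp the outermost (consistent with the pp_rank_ computation further below), a worker's coordinates can be derived from its global rank as in this sketch (hypothetical helper, not part of this patch):

def rank_coordinates(global_rank, mp_degree, sharding_degree, pp_degree, dp_degree):
    assert global_rank < mp_degree * sharding_degree * pp_degree * dp_degree
    mp_rank = global_rank % mp_degree
    sharding_rank = (global_rank // mp_degree) % sharding_degree
    pp_rank = (global_rank // (mp_degree * sharding_degree)) % pp_degree
    dp_rank = global_rank // (mp_degree * sharding_degree * pp_degree)
    return mp_rank, sharding_rank, pp_rank, dp_rank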
+ self.hybrid_dp_mode = "sharding_hybrid_dp" + + # gradient merge + self._gradient_merge_acc_step = int( + self.user_defined_strategy.sharding_configs[ + "gradient_merge_acc_step"]) + self.gradient_merge_mode = None + if self.pp_degree <= 1: + self.gradient_merge_mode = "sharding_gm" + self._grad2merged_grad = dict() + else: + self.gradient_merge_mode = "pp_gm" + self._gradient_merge_acc_step = self.user_defined_strategy.pipeline_configs[ 'accumulate_steps'] + if self._gradient_merge_acc_step > 1: + logging.info("Gradient merge in [{}], acc step = [{}]".format( + self.gradient_merge_mode, self._gradient_merge_acc_step)) + + # optimize offload + self.optimize_offload = self.user_defined_strategy.sharding_configs[ + "optimize_offload"] + + # this feature is designed for ascend, and should NOT be used in GPU training + self.pp_allreduce_in_optimize = self.user_defined_strategy.sharding_configs[ + "pp_allreduce_in_optimize"] if self.inner_opt is None: raise ValueError( "self.inner_opt of ShardingOptimizer should not be None.") - optimize_ops, params_grads = self.inner_opt.minimize( - loss, startup_program, parameter_list, no_grad_set) + + if self.pp_degree > 1: + pp_optimizer = fluid.optimizer.PipelineOptimizer( + self.inner_opt, self._gradient_merge_acc_step) + main_program = loss.block.program + main_program._pipeline_opt = dict() + self.schedule_mode = self.user_defined_strategy.pipeline_configs[ 'schedule_mode'] + main_program._pipeline_opt['schedule_mode'] = self.schedule_mode + main_program._pipeline_opt[ 'micro_batch_size'] = self.user_defined_strategy.pipeline_configs[ 'micro_batch_size'] + self.pp_rank_ = self.role_maker._worker_index() // ( + self.sharding_degree * self.mp_degree) % self.pp_degree + main_program._pipeline_opt['local_rank'] = self.pp_rank_ + main_program._pipeline_opt[ 'global_rank'] = self.role_maker._worker_index() + main_program._pipeline_opt['use_sharding'] = True + # TODO (JZ-LIANG) should revise here to support mixed parallelism with pipeline + main_program._pipeline_opt['ring_id'] = 20 + main_program._pipeline_opt['global_ring_id'] = 3 + + optimize_ops, params_grads, program_list, self.pipeline_pair, self.pp_ring_map = pp_optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + self.pp_degree = len(program_list) + else: + optimize_ops, params_grads = self.inner_opt.minimize( + loss, startup_program, parameter_list, no_grad_set) if startup_program is None: startup_program = default_startup_program() - main_block = loss.block + + if self.pp_degree > 1: + startup_program = startup_program._pipeline_opt['startup_program'] + #main_program = main_program._pipeline_opt['section_program']['program'] + print("pp_rank:", self.pp_rank_) + main_program = program_list[self.pp_rank_] + with open("main_%d" % self.role_maker._worker_index(), 'w') as f: + f.writelines(str(main_program)) + main_block = main_program.global_block() + new_params_grads = [] + for param, grad in params_grads: + if main_block.has_var(param.name): + new_params_grads.append((param, grad)) + params_grads = new_params_grads + + else: + main_block = loss.block + startup_block = startup_program.global_block() self._main_program = main_block.program self._startup_program = startup_program - # step1: set_up - self._set_up(params_grads) + if self.pp_degree > 1: + pp_optimizer._rename_gradient_var_name(main_block) + with open("main_%d" % self.role_maker._worker_index(), 'w') as f: + f.writelines(str(main_program)) + + # step0: _init_comm + self._init_comm() - # step2: split_program -
-        self._split_program(main_block)
+        if self.sharding_degree > 1:
+
+            # step1: build shard
+            self._build_shard(params_grads)
+
+            # step2: split_program
+            self._split_program(main_block)
+
+            # step3: add broadcast and reduce ops
+            self._add_broadcast_allreduce(main_block)
+            main_block._sync_with_cpp()
+            startup_block._sync_with_cpp()
+
+        main_block._sync_with_cpp()
+
+        # step4: remove unneeded ops and vars from block
+        self._prune_main_program(main_block)
+        self._prune_startup_program(startup_block)
+
+        if self.pp_degree > 1:
+            # sharding-pp related logic
+            # pp_optimizer._rename_gradient_var_name(main_block)
+            # crop ops
+            if self.sharding_degree > 1:
+                for idx, op in reversed(list(enumerate(main_block.ops))):
+                    if is_update_op(op):
+                        op_role_var = op.attr('op_role_var')
+                        param_name = op_role_var[0]
+                        if not self._shard.has_param(param_name):
+                            main_block._remove_op(idx)
+
+                for idx, op in reversed(list(enumerate(main_block.ops))):
+                    if op.type != 'cast': continue
+                    in_name = op.input_arg_names[0]
+                    if in_name not in self._params: continue
+                    #if self._shard.has_param(param_name): continue
+                    if in_name not in main_block.vars:
+                        main_block._remove_op(idx)
+
+            accumulated_grad_names = pp_optimizer._accumulate_gradients(
+                main_block)
+            # accumulated_grad_names = sorted(accumulated_grad_names)
+            if self.pp_allreduce_in_optimize:
+                print("persistable FP32 grad: ")
+                print(accumulated_grad_names)
+                first_optimize_op_index = get_first_check_finite_and_unscale_op_idx(
+                    main_block)
+                insert_reduce_ops(
+                    main_block,
+                    first_optimize_op_index,
+                    self.sharding_ring_id,
+                    accumulated_grad_names,
+                    self._shard,
+                    core.op_proto_and_checker_maker.OpRole.Optimize,
+                    use_calc_stream=True)
+            if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp":
+                first_optimize_op_index = get_first_check_finite_and_unscale_op_idx(
+                    main_block)
+                insert_allreduce_ops(
+                    main_block,
+                    first_optimize_op_index,
+                    self.dp_ring_id,
+                    accumulated_grad_names,
+                    core.op_proto_and_checker_maker.OpRole.Optimize,
+                    use_calc_stream=True)
+
+        # if sharding is not used, adapt amp/clip for the remaining parallelism.
+        # cast --> amp --> clip --> opt
+        if self.sharding_degree <= 1:
+            # amp
+            FP16Utils.sync_amp_check_nan_inf(main_block, self.global_ring_id)
+
+            # clip
+            gradientclip_helper = GradientClipHelper(self.global_ring_id)
+            gradientclip_helper.sync_global_norm(
+                main_block, self.global_ring_id, self.dp_degree)
+
+        # step6: loss div dp_degree
+        global_dp_degree = self.sharding_degree * self.dp_degree
+        assert int(global_dp_degree) == global_dp_degree
+        if global_dp_degree > 1:
+            insert_scale_loss_grad_ops(main_block, scale=1.0 / global_dp_degree)

-        # step3: add broadcast and reduce ops
-        self._add_broadcast_allreduce(main_block)
         main_block._sync_with_cpp()
-        startup_block._sync_with_cpp()

-        # step4: insert reduce_sum for grad
-        insert_scale_loss_grad_ops(
-            main_block, scale=1.0 / self.role_maker._worker_num())
-        main_block._sync_with_cpp()
+        # TODO(wangxi): add optimize offload
+        # opt offload should be enabled when gradient merge is enabled and acc_step is quite large (e.g. >> 100);
+        # its synchronous memcpy cannot be overlapped with calc, otherwise it will severely slow down training.
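+        # NOTE(editor): illustrative cost intuition with a made-up acc_step.
+        # With acc_step = 200, the offloaded params/optimizer states are
+        # touched once per 200 micro-steps, so the synchronous H2D/D2H memcpy
+        # is paid rarely; with a small acc_step the same memcpy would sit on
+        # the critical path every few steps and dominate the step time.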
+        if self.optimize_offload:
+            logging.info("Sharding with optimize offload!")
+            offload_helper = OffloadHelper()
+            offload_helper.offload(main_block, startup_block)
+            offload_helper.offload_fp32param(main_block, startup_block)
+
+        # step6: (optional) sharding gradient merge
+        if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1:
+            self._sharding_gradient_merge(main_block)
+
+        # # check op dependency
+        # FIXME (JZ-LIANG) enable checking in the future.
+        # check_broadcast(main_block)
+        # check_allreduce_sum(main_block, self._shard, self.sharding_ring_id,
+        #                     self.dp_ring_id)

-        # step5: remove unneeded ops and vars from block
-        self._prune_main_program(main_block)
-        self._prune_startup_program(startup_block)
+        if self.hybrid_dp:
+            # NOTE(JZ-LIANG) ensure that in both sharding_hybrid_dp & pp_hybrid_dp
+            # the init param broadcast is called after startup pruning
+            self._initialization_broadcast(startup_block)
+
+        with open("start_sharding_%d" % self.role_maker._worker_index(),
+                  'w') as f:
+            f.writelines(str(startup_block.program))
+        with open("main_sharding_%d" % self.role_maker._worker_index(),
+                  'w') as f:
+            f.writelines(str(main_block.program))

-        # check op dependecy
-        check_broadcast(main_block)
-        check_allreduce_sum(main_block, self._shard, self.dp_ring_id)
         self._wait()
+
         return optimize_ops, params_grads

-    def _set_up(self, params_grads):
-        # step 1: initialize nccl
-        self.global_word_size = self.role_maker._worker_num()
-        self.global_rank = self.role_maker._worker_index()
-        self.endpoints = self.role_maker._get_trainer_endpoints()
-        self.current_endpoint = self.endpoints[self.global_rank]
-        self._collective_helper = CollectiveHelper(self.role_maker,
-                                                   self._nrings_sharding)
+    def _init_comm(self):
+        # config sharding & dp groups
-        self._init_comm()
-        # sharding
+        self._build_groups()
+
+        # sync var
+        startup_block = self._startup_program.global_block()
+        self.startup_prog_sync_var = startup_block.create_var(
+            name="startup_prog_sync_var",
+            shape=[1],
+            dtype=core.VarDesc.VarType.INT32,
+            persistable=False)
+
+        # global ring
         self._collective_helper._init_communicator(
-            self._startup_program, self.current_endpoint,
-            self.sharding_group_endpoints, self.sharding_rank,
-            self.sharding_ring_id, True)
-        # dp
-        if self.hybrid_dp:
+            self._startup_program,
+            self.current_endpoint,
+            self.global_endpoints,
+            self.global_rank,
+            self.global_ring_id,
+            False,
+            global_ring_id=self.global_ring_id,
+            sync=False)
+        append_naive_sync(startup_block, self.startup_prog_sync_var,
+                          self.global_ring_id)
+
+        # mp ring
+        if self.mp_degree > 1:
+            self._collective_helper._init_communicator(
+                self._startup_program,
+                self.current_endpoint,
+                self.mp_group_endpoints,
+                self.mp_rank,
+                self.mp_ring_id,
+                False,
+                global_ring_id=self.global_ring_id,
+                sync=False)
+            append_naive_sync(startup_block, self.startup_prog_sync_var,
+                              self.global_ring_id)
+
+        # sharding ring
+        if self.sharding_degree > 1:
             self._collective_helper._init_communicator(
-                self._startup_program, self.current_endpoint,
-                self.dp_group_endpoints, self.dp_rank, self.dp_ring_id, True)
+                self._startup_program,
+                self.current_endpoint,
+                self.sharding_group_endpoints,
+                self.sharding_rank,
+                self.sharding_ring_id,
+                False,
+                global_ring_id=self.global_ring_id,
+                sync=False)
+            append_naive_sync(startup_block, self.startup_prog_sync_var,
+                              self.global_ring_id)
+
+        # pp ring
+        if self.pp_degree > 1:
+            if self.schedule_mode == 'F-then-B':  # GPipe
+                self._collective_helper._init_communicator(
+                    self._startup_program,
+                    self.current_endpoint,
+                    self.pp_group_endpoints,
+                    self.pp_rank,
+                    self.pp_ring_id,
+                    False,
+                    global_ring_id=self.global_ring_id,
+                    sync=False)
+                # append_naive_sync(startup_block, self.startup_prog_sync_var,
+                #                   self.global_ring_id)
+                self._collective_helper._init_communicator(
+                    self._startup_program,
+                    self.current_endpoint,
+                    self.pp_group_endpoints,
+                    self.pp_rank,
+                    self.pp_ring_id + 2,
+                    False,
+                    global_ring_id=self.global_ring_id,
+                    sync=False)
+                # append_naive_sync(startup_block, self.startup_prog_sync_var,
+                #                   self.global_ring_id)
+            else:
+                assert self.schedule_mode == '1F1B'
+                for pair in self.pipeline_pair:
+                    pair_key = pair[0] * 1000 + pair[1]
+                    ring_id = self.pp_ring_map[pair_key]
+                    print("pp pair:{}, ring_id: {}".format(pair, ring_id))
+                    if self.pp_rank not in pair: continue
+                    pp_group_endpoints = [
+                        self.pp_group_endpoints[pair[0]],
+                        self.pp_group_endpoints[pair[1]],
+                    ]
+                    if pair[0] < pair[1]:
+                        start_ring_id = self.pp_ring_id + pair[1] - pair[0] - 1
+                    else:
+                        start_ring_id = self.pp_ring_id + 2 + pair[0] - pair[
+                            1] - 1
+                    pp_rank = 0 if self.pp_rank == pair[0] else 1
+                    self._collective_helper._init_communicator(
+                        self._startup_program,
+                        self.current_endpoint,
+                        pp_group_endpoints,
+                        pp_rank,
+                        ring_id,
+                        False,
+                        global_ring_id=self.global_ring_id,
+                        sync=False)
+                    # append_naive_sync(startup_block, self.startup_prog_sync_var,
+                    #                   self.global_ring_id)
+
+            # TODO (JZ-LIANG) to unify this logic
+            assert self.pp_rank_ == self.pp_rank, "pp rank for pp opt [{}], pp rank for sharding opt [{}]".format(
+                self.pp_rank_, self.pp_rank)
+
+        # pure dp ring
+        if self.dp_degree > 1:
+            self._collective_helper._init_communicator(
+                self._startup_program,
+                self.current_endpoint,
+                self.dp_group_endpoints,
+                self.dp_rank,
+                self.dp_ring_id,
+                False,
+                global_ring_id=self.global_ring_id,
+                sync=False)
+            append_naive_sync(startup_block, self.startup_prog_sync_var,
+                              self.global_ring_id)

-        startup_block = self._startup_program.global_block()
         startup_block._sync_with_cpp()

+    def _build_shard(self, params_grads):
         # step 2: split params
         self._params = set([x[0].name for x in params_grads])
         self._shard.setup(params_grads, self.sharding_rank,
-                          self.sharding_group_size)
+                          self.sharding_degree)

         # step 3: get broadcast vars
         self._broadcast_vars = self._shard.find_broadcast_params(
             self._main_program.global_block())

     def _wait(self, ):
-        endpoints = self.role_maker._get_trainer_endpoints()
-        current_endpoint = endpoints[self.role_maker._worker_index()]
-        if self.role_maker._worker_index() == 0:
+        endpoints = self.global_endpoints[:]
+        current_endpoint = endpoints[self.global_rank]
+        if self.global_rank == 0:
             self._collective_helper._wait(current_endpoint, endpoints)

+    def collect_segment(self, segment, op_idx, block):
+        segment._start_idx = op_idx + 1
+        self._segments.insert(0, segment)
+        new_segment = ProgramSegment(block)
+        new_segment._end_idx = op_idx + 1
+
+        return new_segment
+
     def _split_program(self, block):
         for op_idx, op in reversed(list(enumerate(block.ops))):
             if int(op.attr('op_role')) != int(OpRole.Optimize):
                 last_backward_op_idx = op_idx + 1
                 break
+
+        var2broadcast_time = dict()
         segment = ProgramSegment(block)
         segment._end_idx = last_backward_op_idx
         for op_idx in reversed(range(last_backward_op_idx)):
             op = block.ops[op_idx]
             assert (int(op.attr('op_role')) != int(OpRole.Optimize))
-            if segment._param_mem >= self._fuse_broadcast_MB:
-                segment._start_idx = op_idx + 1
-                self._segments.insert(0, segment)
-                segment = ProgramSegment(block)
-                segment._end_idx = op_idx + 1
+            if 
self._sharding_segment_strategy == "segment_broadcast_MB": + if segment._param_mem >= self._broadcast_MB: + segment = self.collect_segment(segment, op_idx, block) + + elif self._sharding_segment_strategy == "segment_anchors": + if int(op.attr('op_role')) == int(OpRole.Backward): + for input_name in op.desc.input_arg_names(): + + # NOTE (JZ-LIANG) naive rule to support amp, if amp change, should modify here accordingly + if self.user_defined_strategy.amp: + if ".cast_fp16@GRAD" not in input_name: + continue + else: + input_name = input_name[:input_name.find( + ".cast_fp16@GRAD")] + + if input_name in self._backward_remain_anchors: + segment = self.collect_segment(segment, op_idx, + block) + assert input_name not in self._forward_remain_anchors, "segment anchor [{}] met twice !".format( + input_name) + self._backward_remain_anchors.remove(input_name) + self._forward_remain_anchors.append(input_name) + elif int(op.attr('op_role')) == int(OpRole.Forward): + for output_name in op.desc.output_arg_names(): + if output_name in self._forward_remain_anchors: + segment = self.collect_segment(segment, op_idx, + block) + self._forward_remain_anchors.remove(output_name) # find broadcast vars for input_name in op.desc.input_arg_names(): @@ -190,6 +570,21 @@ def _split_program(self, block): broadcast_var_name = unique_name.generate(input_name + "@BroadCast") segment._fill_constant_vars.append(broadcast_var_name) + + # (JZ-LIANG) should use Param base name ? + broadcast_var_base_name = input_name + if "subprog" in broadcast_var_base_name: + # remove suffix + broadcast_var_base_name = broadcast_var_base_name[: + broadcast_var_base_name. + find( + ".subprog" + )] + + var2broadcast_time[ + broadcast_var_base_name] = var2broadcast_time.get( + broadcast_var_base_name, 0) + 1 + segment._param2broadcast[input_name] = broadcast_var_name segment._broadcast_vars.append((broadcast_var_name, self._shard.device(input_name))) @@ -197,17 +592,22 @@ def _split_program(self, block): self._main_program.global_block().var(input_name)) # find reduce vars - if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: - op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] - if len(op_role_var) != 0: - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - param, reduced_grad = op_role_var[i], op_role_var[i + 1] - segment._allreduce_vars.append(reduced_grad) - assert ( - reduced_grad not in self._reduced_grads_to_param) - self._reduced_grads_to_param[reduced_grad] = param + if self.pp_degree > 1 and self.pp_allreduce_in_optimize: + # place pipeline gradient allreduce in optimize + pass + else: + if is_backward_op(op) and \ + OP_ROLE_VAR_KEY in op.attr_names: + op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] + if len(op_role_var) != 0: + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + param, reduced_grad = op_role_var[i], op_role_var[ + i + 1] + segment._allreduce_vars.append(reduced_grad) + assert (reduced_grad not in + self._reduced_grads_to_param) + self._reduced_grads_to_param[reduced_grad] = param # find cast op if FP16Utils.is_fp16_cast_op(block, op, self._params): @@ -219,6 +619,30 @@ def _split_program(self, block): if segment._param_mem > 0: segment._start_idx = 0 self._segments.insert(0, segment) + + if self._sharding_segment_strategy == "segment_anchors": + assert len( + self._forward_remain_anchors) == 0, "remain anchors {}".format( + self._forward_remain_anchors) + assert len( + self._backward_remain_anchors) == 0, "remain anchors {}".format( + 
self._backward_remain_anchors)
+
+        if self._verbose:
+            for varname in sorted(
+                    var2broadcast_time, key=var2broadcast_time.get,
+                    reverse=True):
+                logging.info("Sharding broadcast: [{}] times [{}]".format(
+                    var2broadcast_time[varname], varname))
+            for idx_ in range(len(self._segments)):
+                logging.info("segment [{}] :".format(idx_))
+                logging.info("start op: [{}]  [{}]".format(block.ops[
+                    self._segments[idx_]._start_idx].desc.type(), block.ops[
+                        self._segments[idx_]._start_idx].desc.input_arg_names(
+                        )))
+                logging.info("end op: [{}]  [{}]".format(block.ops[
+                    self._segments[idx_]._end_idx].desc.type(), block.ops[
+                        self._segments[idx_]._end_idx].desc.input_arg_names()))
         return

     def _prune_main_program(self, block):
@@ -234,10 +658,21 @@ def _prune_main_program(self, block):
         """
         weightdecay_helper = WeightDecayHelper()
         weightdecay_helper.prune_weight_decay(block, self._shard)
+        # NOTE (JZ-LIANG) the sync of FoundInfinite should be among one entire Model
+        # Parallelism group, and each Data Parallelism group should have its own sync
+        # of FoundInfinite; amp could use the global group for sync
         FP16Utils.prune_fp16(block, self._shard, self._reduced_grads_to_param,
-                             self.sharding_ring_id)
-        gradientclip_helper = GradientClipHelper(self.sharding_ring_id)
-        gradientclip_helper.prune_gradient_clip(block, self._shard)
+                             self.global_ring_id)
+        # clipbyglobalnorm should only use the Model parallelism group (mp-sharding-pp)
+        if self.mp_degree * self.pp_degree == 1:
+            # separate the sharding-hybrid scenario to keep the accuracy
+            gradientclip_helper = GradientClipHelper(self.sharding_ring_id)
+            gradientclip_helper.prune_gradient_clip(
+                block, self._shard, pure_dp_degree=1)
+        else:
+            gradientclip_helper = GradientClipHelper(self.global_ring_id)
+            gradientclip_helper.prune_gradient_clip(
+                block, self._shard, pure_dp_degree=self.dp_degree)

         # build prog deps
         reduced_grads = []
@@ -264,8 +699,13 @@ def _prune_main_program(self, block):
         # Prune
         for idx, op in reversed(list(enumerate(block.ops))):
             if op.type in [
-                    "c_allreduce_sum", "c_sync_comm_stream",
-                    "c_calc_comm_stream", "c_gen_nccl_id", "c_comm_init"
+                    "c_allreduce_sum",
+                    "c_sync_comm_stream",
+                    "c_calc_comm_stream",
+                    "c_gen_nccl_id",
+                    "c_comm_init",
+                    'send_v2',
+                    'recv_v2',
             ]:
                 pass
             elif op.type == "conditional_block":
@@ -302,30 +742,76 @@ def _prune_main_program(self, block):
             if program_deps.should_remove_op(idx):
                 program_deps.remove_op(idx)

+        # NOTE (JZ-LIANG) revise and unify logic here
+        # sharding supports the fp16_allreduce logic
+        block._sync_with_cpp()
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if op.type == 'concat' and is_optimizer_op(op):
+                # remove inputs that are not on this card
+                reserved_x = []
+                for var_name in op.desc.input("X"):
+                    if block.has_var(var_name): reserved_x.append(var_name)
+                op.desc.set_input('X', reserved_x)
         block._sync_with_cpp()
         return

     def _add_broadcast_allreduce(self, block):
         """
-        _add_broadcast_allreduce
+        add broadcast and allreduce ops
+        if gradient_merge is enabled, insert the related ops
+
+        if combined with pipeline (grad accumulate),
+        the grad allreduce should be done in the optimize role
         """
         if len(self._segments) < 1:
             return
         # sharding
+        if self.pp_degree > 1 and self.pp_allreduce_in_optimize:
+            for idx in range(len(self._segments)):
+                assert len(self._segments[idx]._allreduce_vars) == 0
+
+        # NOTE (JZ-LIANG) revise and unify logic here
+        # fix the _end_idx for segments[-1] if pp is used.
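+        # NOTE(editor): mechanics of the scan below -- it walks segments[-1]
+        # backward; every fill_constant/sum op whose first output name contains
+        # "MERGED", and every cast op writing a "@TMP" var, resets new_end_idx
+        # to just past that op, so the segment finally ends right after the
+        # earliest such gradient-accumulation bookkeeping op in the tail.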
+ new_end_idx = self._segments[-1]._end_idx + for idx in range(self._segments[-1]._end_idx - 1, + self._segments[-1]._start_idx - 1, -1): + op = block.ops[idx] + if op.type == "fill_constant" or op.type == "sum": + if "MERGED" in op.output_arg_names[0]: new_end_idx = idx + 1 + elif op.type == "cast": + if "@TMP" in op.output_arg_names[0]: new_end_idx = idx + 1 + self._segments[-1]._end_idx = new_end_idx + if self._segments[-1]._allreduce_vars: shard_allredue_vars = self._shard.filter_grads(self._segments[-1] ._allreduce_vars) - if self.hybrid_dp and len(shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, self._segments[-1]._end_idx, - self.dp_ring_id, shard_allredue_vars) - insert_allreduce_ops(block, self._segments[-1]._end_idx, - self.dp_ring_id, shard_allredue_vars) + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( + shard_allredue_vars) >= 1: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self.dp_ring_id, shard_allredue_vars) + insert_allreduce_ops(block, self._segments[-1]._end_idx, + self.dp_ring_id, shard_allredue_vars) + # gradient merge + elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: + self.create_persistable_gradients_and_insert_merge_ops( + block, + self._startup_program.global_block(), + self._segments[-1]._end_idx, shard_allredue_vars, + self._shard) + insert_sync_comm_ops(block, self._segments[-1]._end_idx, self.sharding_ring_id, self._segments[-1]._allreduce_vars) - insert_allreduce_ops(block, self._segments[-1]._end_idx, - self.sharding_ring_id, - self._segments[-1]._allreduce_vars) + # allreduce --> reduce + insert_reduce_ops( + block, + self._segments[-1]._end_idx, + self.sharding_ring_id, + self._segments[-1]._allreduce_vars, + self._shard, + op_role=OpRole.Backward, + use_calc_stream=False) for idx, segment in reversed(list(enumerate(self._segments))): allreduce_vars = self._segments[ @@ -364,19 +850,32 @@ def _add_broadcast_allreduce(self, block): # step2: add Sync ops shard_allredue_vars = self._shard.filter_grads(allreduce_vars) - if self.hybrid_dp and len(shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, segment._end_idx, self.dp_ring_id, - shard_allredue_vars) + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( + shard_allredue_vars) >= 1: + insert_sync_comm_ops(block, segment._end_idx, + self.dp_ring_id, shard_allredue_vars) + + broad_cast_vars = [x[0] for x in broadcast_vars] + if len(broad_cast_vars) > 0: + insert_sync_comm_ops(block, segment._end_idx, + self.sharding_ring_id, + broad_cast_vars) + else: + comm_dep_vars = allreduce_vars + [ + x[0] for x in broadcast_vars + ] + if len(comm_dep_vars) > 0: + insert_sync_comm_ops(block, segment._end_idx, + self.sharding_ring_id, + comm_dep_vars) + # gradient merge + elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: broad_cast_vars = [x[0] for x in broadcast_vars] if len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, broad_cast_vars) - else: - comm_dep_vars = allreduce_vars + [x[0] for x in broadcast_vars] - if len(comm_dep_vars) > 0: - insert_sync_comm_ops(block, segment._end_idx, - self.sharding_ring_id, comm_dep_vars) calc_dep_vars = fill_constant_vars + [ k for k, v in cast_ops.items() @@ -394,18 +893,41 @@ def _add_broadcast_allreduce(self, block): 
insert_cast_ops(block, segment._end_idx, cast_ops) # step5: add broadcast ops + # gradient merge + if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: + self.create_persistable_gradients_and_insert_merge_ops( + block, + self._startup_program.global_block(), segment._start_idx, + shard_allredue_vars, self._shard) + insert_broadcast_ops(block, segment._start_idx, self.sharding_ring_id, broadcast_vars) + # step6: add all_reduce ops # dp - if self.hybrid_dp and len(shard_allredue_vars) >= 1: - insert_allreduce_ops(block, segment._start_idx, self.dp_ring_id, - shard_allredue_vars) + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( + shard_allredue_vars) >= 1: + insert_allreduce_ops(block, segment._start_idx, + self.dp_ring_id, shard_allredue_vars) + insert_sync_comm_ops(block, segment._start_idx, + self.sharding_ring_id, allreduce_vars) + # gradient merge + elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: insert_sync_comm_ops(block, segment._start_idx, self.sharding_ring_id, allreduce_vars) # sharding - insert_allreduce_ops(block, segment._start_idx, - self.sharding_ring_id, allreduce_vars) + # allreduce --> reduce + # TODO temp change + if len(allreduce_vars) > 0: + insert_reduce_ops( + block, + segment._start_idx, + self.sharding_ring_id, + allreduce_vars, + self._shard, + op_role=OpRole.Backward, + use_calc_stream=False) block._sync_with_cpp() @@ -456,59 +978,472 @@ def _prune_startup_program(self, block): block._remove_var(var_name, sync=False) block._sync_with_cpp() - def _init_comm(self): - - if self.hybrid_dp: - self.sharding_group_size = self.user_defined_strategy.sharding_configs[ - "sharding_group_size"] - self.sharding_ring_id = 0 - self.sharding_rank = self.global_rank % self.sharding_group_size - - self.dp_group_size = self.global_word_size // self.sharding_group_size - self.dp_rank = self.global_rank // self.sharding_group_size - self.dp_ring_id = self.sharding_rank + 1 - - self.sharding_group_endpoints = [ - ep for idx, ep in enumerate(self.endpoints) - if (idx // self.sharding_group_size) == self.dp_rank - ] - self.dp_group_endpoints = [ - ep for idx, ep in enumerate(self.endpoints) - if (idx % self.sharding_group_size) == self.sharding_rank + def _build_groups(self): + """ + pre-assign ring ids + mp: 0 + sharding: 1 + pure-dp: 2 + global: 3 + pp: >= 20 + if one parallelism is not enable: -1 + and only support parallelism hierarchy: mp --> sharding --> pp --> dp + """ + # step 1: initialize nccl + self.global_word_size = self.role_maker._worker_num() + self.global_rank = self.role_maker._worker_index() + self.global_endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.global_endpoints[self.global_rank] + self._collective_helper = CollectiveHelper( + self.role_maker, nrings=self._nrings_sharding) + assert self.global_word_size % self.mp_degree == 0, \ + "global_word_size: {} should be divisible to the mp_degree: {}".format(self.global_word_size, self.mp_degree) + assert self.global_word_size % self.sharding_degree == 0, \ + "global_word_size: {} should be divisible to the sharding_degree: {}".format(self.global_word_size, self.sharding_degree) + assert self.global_word_size % self.pp_degree == 0, \ + "global_word_size: {} should be divisible to the pp_degree: {}".format(self.global_word_size, self.pp_degree) + assert self.global_word_size % self.dp_degree == 0, \ + 
"global_word_size: {} should be divisible to the dp_degree: {}".format(self.global_word_size, self.dp_degree) + + # mp group + if self.mp_degree > 1: + self.mp_ring_id = 0 + self.mp_rank = self.global_rank % self.mp_degree + self.mp_group_id = self.global_rank // self.mp_degree + self.mp_group_endpoints = [ + ep for idx, ep in enumerate(self.global_endpoints) + if idx // self.mp_degree == self.mp_group_id ] - assert self.global_word_size > self.sharding_group_size, \ - "global_word_size: {} should be larger than sharding_group_size: {}".format(self.global_word_size, self.sharding_group_size) - assert self.global_word_size % self.sharding_group_size == 0, \ - "global_word_size: {} should be divisible to the sharding_group_size: {}".format(self.global_word_size, self.sharding_group_size) - assert self.dp_group_size * self.sharding_group_size == self.global_word_size, \ - "global_word_size: {} should be equal to the product of sharding_group_size: {} and dp_group_size: {}".format( - self.global_word_size, - self.sharding_group_size, - self.dp_group_size) - - logging.info("Using Sharing&DP mode !") + assert self.current_endpoint in self.mp_group_endpoints + assert len( + self.mp_group_endpoints + ) == self.mp_degree, "num of mp worker in group is [{}], but mp group size is [{}]".format( + len(self.mp_group_endpoints), self.mp_degree) + else: + self.mp_degree = 1 + self.mp_ring_id = -1 + self.mp_rank = -1 + self.mp_group_id = -1 + self.mp_group_endpoints = [] + + # sharding + if self.sharding_degree > 1: + self.sharding_ring_id = 1 + self.sharding_rank = (self.global_rank // + self.mp_degree) % self.sharding_degree + self.sharding_group_id = self.global_rank // (self.mp_degree * + self.sharding_degree) + # mp + sharding + ... + if self.mp_degree > 1: + self.sharding_group_endpoints = [ + ep for idx, ep in enumerate(self.global_endpoints) + if (idx // (self.mp_degree * self.sharding_degree)) == self. + sharding_group_id and idx % self.mp_degree == self.mp_rank + ] + # sharding + ... + else: + self.sharding_group_endpoints = [ + ep for idx, ep in enumerate(self.global_endpoints) + if (idx // (self.mp_degree * self.sharding_degree) + ) == self.sharding_group_id + ] + assert self.current_endpoint in self.sharding_group_endpoints + else: + self.sharding_degree = 1 + self.sharding_ring_id = -1 + self.sharding_rank = -1 + self.sharding_group_id = -1 + self.sharding_group_endpoints = [] + + # pp + if self.pp_degree > 1: + self.pp_ring_id = 20 + self.pp_rank = self.global_rank // (self.sharding_degree * + self.mp_degree) % self.pp_degree + # (NOTE): Already adjust for (outter-pure) dp + self.pp_group_id = self.global_rank // ( + self.mp_degree * self.sharding_degree * self.pp_degree) + pp_first_stage_idx = self.global_rank % ( + self.sharding_degree * self.mp_degree) + self.pp_group_id * ( + self.mp_degree * self.sharding_degree * self.pp_degree) + pp_stage_offset = self.sharding_degree * self.mp_degree + self.pp_group_endpoints = [] + for i in range(self.pp_degree): + self.pp_group_endpoints.append(self.global_endpoints[ + pp_first_stage_idx + pp_stage_offset * i]) + assert self.current_endpoint in self.pp_group_endpoints + else: + self.pp_degree = 1 + self.pp_ring_id = -1 + self.pp_rank = -1 + self.pp_group_id = -1 + self.pp_group_endpoints = [] + + # outter-pure-dp group + # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism + # e.g. 
+
+        # outer-pure-dp group
+        # NOTE (JZ-LIANG) support outer-pure-dp to scale the throughput in 3D parallelism
+        # e.g. mp-sharding-pp-dp
+        # sharding-hybrid-dp as one scenario of outer-pure-dp
+        assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format(
+            self.mp_degree, self.sharding_degree, self.pp_degree,
+            self.dp_degree, self.global_word_size)
+
+        if self.dp_degree > 1:
+            self.dp_ring_id = 2
+            self.dp_rank = self.global_rank // (self.sharding_degree *
+                                                self.mp_degree * self.pp_degree)
+            dp_first_rank_idx = self.global_rank % (
+                self.sharding_degree * self.mp_degree * self.pp_degree)
+            dp_offset = (self.sharding_degree * self.mp_degree *
+                         self.pp_degree)
+            self.dp_group_endpoints = []
+            for i in range(self.dp_degree):
+                self.dp_group_endpoints.append(self.global_endpoints[
+                    dp_first_rank_idx + dp_offset * i])
+            assert self.current_endpoint in self.dp_group_endpoints
+            logging.info("Hybrid DP mode turned on!")
         else:
-            self.sharding_ring_id = 0
-            self.sharding_rank = self.global_rank
-            self.sharding_group_size = self.role_maker._worker_num()
-            self.sharding_group_endpoints = self.endpoints
             self.dp_ring_id = -1
             self.dp_rank = -1
-            self.dp_group_size = None
-            self.dp_group_endpoints = None
+            self.dp_group_endpoints = []

-            logging.info("Using Sharing alone mode !")
+        # global group
+        # used for gen_nccl_comm_sync, amp check nan inf, clip by global norm
+        # NOTE (JZ-LIANG) when using the global ring to calc the global norm with dp_degree > 1,
+        # the allreduce result should be divided by dp_degree
+        self.global_ring_id = 3

         logging.info("global word size: {}".format(self.global_word_size))
         logging.info("global rank: {}".format(self.global_rank))
-        logging.info("sharding group_size: {}".format(self.sharding_group_size))
+        logging.info("global endpoints: {}".format(self.global_endpoints))
+        logging.info("global ring id: {}".format(self.global_ring_id))
+        logging.info("#####" * 6)
+
+        logging.info("mp group size: {}".format(self.mp_degree))
+        logging.info("mp rank: {}".format(self.mp_rank))
+        logging.info("mp group id: {}".format(self.mp_group_id))
+        logging.info("mp group endpoints: {}".format(self.mp_group_endpoints))
+        logging.info("mp ring id: {}".format(self.mp_ring_id))
+        logging.info("#####" * 6)
+
+        logging.info("sharding group size: {}".format(self.sharding_degree))
         logging.info("sharding rank: {}".format(self.sharding_rank))
-        logging.info("dp group size: {}".format(self.dp_group_size))
-        logging.info("dp rank: {}".format(self.dp_rank))
-        logging.info("current endpoint: {}".format(self.current_endpoint))
+        logging.info("sharding group id: {}".format(self.sharding_group_id))
         logging.info("sharding group endpoints: {}".format(
            self.sharding_group_endpoints))
-        logging.info("dp group endpoints: {}".format(self.dp_group_endpoints))
-        logging.info("global word endpoints: {}".format(self.endpoints))
+        logging.info("sharding ring id: {}".format(self.sharding_ring_id))
+        logging.info("#####" * 6)
+
+        logging.info("pp group size: {}".format(self.pp_degree))
+        logging.info("pp rank: {}".format(self.pp_rank))
+        logging.info("pp group id: {}".format(self.pp_group_id))
+        logging.info("pp group endpoints: {}".format(self.pp_group_endpoints))
+        logging.info("pp ring id: {}".format(self.pp_ring_id))
+        logging.info("#####" * 6)
+
+        logging.info("pure dp group size: {}".format(self.dp_degree))
+        logging.info("pure dp rank: {}".format(self.dp_rank))
+        logging.info("pure dp group endpoints: {}".format(
+            self.dp_group_endpoints))
+        logging.info("pure dp ring id: {}".format(self.dp_ring_id))
{}".format(self.dp_ring_id)) + logging.info("#####" * 6) return + + def _initialization_broadcast(self, startup_block): + """ + this funtion is to ensure the initialization between dp group to be + identical when hybrid-dp is used. + """ + params = [] + for param in startup_block.iter_parameters(): + params.append(param) + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': self.dp_ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + startup_block.append_op( + type='c_sync_comm_stream', + inputs={'X': params}, + outputs={'Out': params}, + attrs={'ring_id': self.dp_ring_id, + OP_ROLE_KEY: OpRole.Forward}) + # sync within global group + append_naive_sync(startup_block, self.startup_prog_sync_var, + self.global_ring_id) + + # sharding gradient merge + def create_persistable_gradients_and_insert_merge_ops( + self, main_block, startup_block, insert_idx, grad_names, shard): + + for grad_name in grad_names: + assert get_grad_device( + grad_name, shard + ) == shard.worker_idx, "try to merge gradient not belong to current shard: [{}]".format( + grad_name) + persistable_grad_name = grad_name + '@GradiantMerge' + assert grad_name not in self._grad2merged_grad, "grad [{}] already in grad2merged_grad, maybe you meet sharing weight case !".format( + grad_name) + self._grad2merged_grad[grad_name] = persistable_grad_name + grad_var = main_block.var(grad_name) + # create var + gradient_merge_var = main_block.create_var( + name=persistable_grad_name, + shape=grad_var.shape, + dtype=grad_var.dtype, + persistable=True) + startup_gradient_merge_var = startup_block.create_var( + name=persistable_grad_name, + shape=grad_var.shape, + dtype=grad_var.dtype, + persistable=True) + + # merge gradient + main_block._insert_op_without_sync( + insert_idx, + type="elementwise_add", + inputs={'X': grad_name, + 'Y': gradient_merge_var}, + outputs={'Out': gradient_merge_var}, + attrs={ + 'axis': -1, + 'use_mkldnn': False, + OP_ROLE_KEY: OpRole.Backward + }) + + # startup initialization + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": grad_var.shape, + "dtype": grad_var.dtype, + "value": float(0), + }) + + main_block._sync_with_cpp() + startup_block._sync_with_cpp() + + def _create_gm_cond(self, main_block): + # Add const var + acc_step_var = layers.create_global_var( + name="gradient_merge_acc_step", + shape=[1], + value=int(self._gradient_merge_acc_step), + dtype='int32', + persistable=True, + force_cpu=True) + + zero_var = layers.create_global_var( + name="gradient_merge_zero", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True) + + # Add step var & cond var + current_step_var = layers.create_global_var( + name="gradient_merge_current_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True) + + cond_var = layers.create_global_var( + name="gradient_merge_cond", + shape=[1], + value=bool(0), + dtype='bool', + persistable=False, + force_cpu=True) + + with device_guard("cpu"): + # step_var = (step_var + 1) % k_step + main_block.append_op( + type='increment', + inputs={'X': [current_step_var]}, + outputs={'Out': [current_step_var]}, + attrs={'step': float(1), + OP_ROLE_KEY: OpRole.Optimize}) + + main_block.append_op( + type='elementwise_mod', + inputs={'X': current_step_var, + 'Y': acc_step_var}, + outputs={'Out': current_step_var}, + attrs={ + 'axis': -1, + OP_ROLE_KEY: OpRole.Optimize, + 'use_mkldnn': False + }) + + # 
+            # cond_var = (step_var == 0)
+            main_block.append_op(
+                type='equal',
+                inputs={'X': current_step_var,
+                        'Y': zero_var},
+                outputs={'Out': cond_var},
+                attrs={OP_ROLE_KEY: OpRole.Optimize})
+        # paddle.static.Print(current_step_var, message="in FWBW last conditional")
+        return cond_var
+
+    def _true_apply_gradient(self):
+        """
+        allreduce grad@gradientmerge in dp group
+        grad@gradientmerge / acc_step
+        re-create all optimize ops of the original main block and rename them
+            cast(backward)
+            amp
+            clip
+            opt
+        # fill constant grad@gradientmerge
+        """
+        # current conditional block
+        main_block = self._main_program.global_block()
+        cur_block_idx = self._main_program.current_block_idx
+        cur_block = self._main_program.current_block()
+        self.cond_block = self._main_program.current_block()
+
+        # cur_block's forward_block & backward_block is itself
+        cur_block._set_forward_block_idx(cur_block_idx)
+
+        # allreduce grad@gradientmerge
+        if self.hybrid_dp:
+            assert self.dp_ring_id >= 0, "dp_ring_id should be larger than 0 when in sharding&DP mode"
+            for grad, merged_grad in self._grad2merged_grad.items():
+                merged_grad_var = main_block.var(merged_grad)
+                cur_block.append_op(
+                    type='c_allreduce_sum',
+                    inputs={'X': merged_grad_var},
+                    outputs={'Out': merged_grad_var},
+                    attrs={
+                        'ring_id': self.dp_ring_id,
+                        'use_calc_stream': True,
+                        OP_ROLE_KEY: OpRole.Optimize
+                    })
+
+        # grad@gradientmerge / acc_step
+        for grad, merged_grad in self._grad2merged_grad.items():
+            # grad /= k_steps
+            merged_grad_var = main_block.var(merged_grad)
+            cur_block.append_op(
+                type='scale',
+                inputs={'X': merged_grad_var},
+                outputs={'Out': merged_grad_var},
+                attrs={
+                    'scale': 1.0 / float(self._gradient_merge_acc_step),
+                    'bias': 0.0,
+                    'bias_after_scale': False,
+                    OP_ROLE_KEY: OpRole.Optimize
+                })
+
+        # re-create optimize ops
+        already_moved_var_names = []
+        for op_desc in self.original_optimize_ops_desc:
+            new_op_desc = cur_block.desc.append_op()
+            new_op_desc.copy_from(op_desc)
+
+            for input_name in new_op_desc.input_arg_names():
+                if input_name in self._grad2merged_grad:
+                    new_op_desc._rename_input(
+                        input_name, self._grad2merged_grad[input_name])
+
+            for output_name in new_op_desc.output_arg_names():
+                if output_name in self._grad2merged_grad:
+                    new_op_desc._rename_output(
+                        output_name, self._grad2merged_grad[output_name])
+
+                # move non temp optimize vars from block0 to cond block
+                if output_name not in already_moved_var_names and output_name not in self._grad2merged_grad.keys(
+                ):
+                    var_ = self._main_program.global_block().var(output_name)
+                    if not var_.persistable:
+                        # move
+                        name_ = var_.name
+                        shape_ = var_.shape
+                        type_ = var_.dtype
+                        self._main_program.global_block()._remove_var(
+                            var_.name, sync=False)
+                        self.cond_block.create_var(
+                            name=name_,
+                            shape=shape_,
+                            dtype=type_,
+                            persistable=False)
+                        already_moved_var_names.append(name_)
+
+        self._main_program.global_block()._sync_with_cpp()
+        cur_block._sync_with_cpp()
+
+        # fill zero to grad@gradientmerge
+        for grad, merged_grad in self._grad2merged_grad.items():
+            merged_grad_var = main_block.var(merged_grad)
+            cur_block.append_op(
+                type='fill_constant',
+                outputs={'Out': merged_grad_var},
+                attrs={
+                    "shape": merged_grad_var.shape,
+                    "dtype": merged_grad_var.dtype,
+                    "value": float(0),
+                    OP_ROLE_KEY: OpRole.Optimize
+                })
+
+        # lr_var = main_block.var("gradient_merge_current_step")
+        # paddle.static.Print(lr_var, message="in OPTIMIZE last conditional")
+
+    def _sharding_gradient_merge(self, main_block):
+        """
+        copy all optimize ops in origin main block
+        remove 
all optimize ops in origin main block + create cond block + + """ + # copy original optimize ops to temp ops desc list + # remove them from block 0 + tmp_copy_block = self._main_program._create_block() + + self.original_optimize_ops_desc = [] + for op_idx, op in reversed(list(enumerate(main_block.ops))): + if int(op.attr('op_role')) != int(OpRole.Optimize): + continue + else: + tmp_op_desc = tmp_copy_block.desc.append_op() + tmp_op_desc.copy_from(op.desc) + self.original_optimize_ops_desc.append(tmp_op_desc) + main_block._remove_op(op_idx, sync=False) + tmp_copy_block._sync_with_cpp() + self.original_optimize_ops_desc = list( + reversed(self.original_optimize_ops_desc)) + + # back to block 0 + self._main_program._rollback() + + # create cond vars and ops at the end of block 0 + cond = self._create_gm_cond(main_block) + + # create cond block + cond_block = self._main_program._create_block() + self._true_apply_gradient() + + # back to block 0 + self._main_program._rollback() + + # cond op + step_scope = self._main_program.global_block().create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + conditional_block_op = self._main_program.global_block().append_op( + type='conditional_block', + inputs={ + 'Cond': cond, + 'Input': [], + }, + outputs={'Out': [], + 'Scope': [step_scope]}, + attrs={ + 'sub_block': cond_block, + 'is_scalar_condition': True, + }) diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index a56868060055e8..aa7df57e3c58bd 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -768,7 +768,7 @@ def _init_server(self, dirname=None, var_names=None, **kwargs): server = self._get_fleet_proto(is_server=True, is_sync=is_sync) proto_txt = str(server) - debug = bool(os.getenv("PSERVER_DEBUG", "0")) + debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) if debug: print("server: \n{}".format(proto_txt)) diff --git a/python/paddle/distributed/fleet/utils/ps_util.py b/python/paddle/distributed/fleet/utils/ps_util.py index a409d02c984cf2..7bf7bec43de008 100644 --- a/python/paddle/distributed/fleet/utils/ps_util.py +++ b/python/paddle/distributed/fleet/utils/ps_util.py @@ -16,6 +16,7 @@ import numpy as np import os import paddle +import warnings class DistributedInfer: @@ -104,8 +105,6 @@ def _init_dense_params(self, exe=None, dirname=None): vars=need_load_vars) def get_dist_infer_program(self): - import paddle.distributed.fleet as fleet - varname2tables = self._get_sparse_table_map() convert_program = self._convert_program(self.origin_main_program, varname2tables) @@ -185,6 +184,7 @@ def _pull_sparse_fuse(_program, pull_sparse_ops): "is_distributed": is_distributed, "padding_idx": padding_idx, "table_id": table_id, + "is_test": True, "lookup_table_version": op_type }) else: @@ -193,6 +193,9 @@ def _pull_sparse_fuse(_program, pull_sparse_ops): ) pull_sparse_ops = _get_pull_sparse_ops(program) + warnings.warn( + "lookup_table will be forced to test mode when use DistributedInfer" + ) _pull_sparse_fuse(program, pull_sparse_ops) return program diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 56e59ac88efee7..bf49604a897e5b 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -303,8 +303,8 @@ def _throw_exception(self, error_index): raise Exception("Process %d terminated with signal %s." % (error_index, name)) else: - raise Exception("Process %d terminated with exit code %d." 
& ( - error_index, exitcode)) + raise Exception("Process %d terminated with exit code %d." % + (error_index, exitcode)) original_trace = self.error_queues[error_index].get() msg = "\n\n----------------------------------------------\n" \ diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py old mode 100644 new mode 100755 index 33e2e387a82758..572ebb26d73cb4 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -115,7 +115,7 @@ def is_amp_cast(op): updated_min_idx = min_idx while idx_ > pre_segment_end_idx: if is_amp_cast(self.ops[idx_]): - _logger.debug("found amp-cast op: {}, : {}".format(self.ops[ + _logger.info("found amp-cast op: {}, : {}".format(self.ops[ idx_].desc.type(), self.ops[idx_].desc.input_arg_names()[ 0])) updated_min_idx = idx_ @@ -155,7 +155,7 @@ def sort_checkpoints(self, checkpoints_name): sorted_checkpoints = [] for name in checkpoints_name: if name not in self.var_op_deps: - _logger.debug( + _logger.info( "Recompute Optimizer: deleted %s from checkpoints, because it is not used in paddle program." % name) elif self.var_op_deps[name]["var_as_output_ops"] == []: @@ -233,6 +233,8 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): new_op_desc = block.desc.append_op() new_op_desc.copy_from(desc) new_op_desc._set_attr(op_role_attr_name, backward) + if desc.has_attr('op_device'): + new_op_desc._set_attr('op_device', desc.attr('op_device')) result_descs.append(new_op_desc) return result_descs @@ -252,6 +254,8 @@ def _add_descs_to_block(descs, block): new_op_desc = block.desc.append_op() new_op_desc.copy_from(desc) new_op_desc._set_attr(op_role_attr_name, backward) + if desc.has_attr('op_device'): + new_op_desc._set_attr('op_device', desc.attr('op_device')) result_descs.append(new_op_desc) return result_descs @@ -784,7 +788,6 @@ def _append_backward_ops_with_checkpoints_( start_idx = 0 pre_segment_end_idx = -1 while True: - _logger.debug("FW op range[0] - [{}]".format(len(ops))) if start_idx >= len(checkpoints_name) - 1: break # min_idx: checkpoint_1' s input op @@ -797,6 +800,9 @@ def _append_backward_ops_with_checkpoints_( min_idx = program_stat._update_segment_start( min_idx, pre_segment_end_idx) segments.append([min_idx, max_idx + 1]) + else: + _logger.info("Could not recompute op range [{}] - [{}] ".format( + min_idx, max_idx + 1)) start_idx += 1 @@ -806,15 +812,15 @@ def _append_backward_ops_with_checkpoints_( recompute_segments = segments for i, (idx1, idx2) in enumerate(recompute_segments): - _logger.debug("recompute segment[{}]".format(i)) - _logger.debug("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( + _logger.info("recompute segment[{}]".format(i)) + _logger.info("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( ), ops[idx1].desc.input_arg_names())) - _logger.debug("segment end op: [{}]: [{}]".format(ops[ + _logger.info("segment end op: [{}]: [{}]".format(ops[ idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names())) - _logger.debug("recompute segment[{}]".format(i)) - _logger.debug("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( + _logger.info("recompute segment[{}]".format(i)) + _logger.info("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( ), ops[idx1].desc.input_arg_names())) - _logger.debug("segment end op: [{}]: [{}]".format(ops[ + _logger.info("segment end op: [{}]: [{}]".format(ops[ idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names())) # 2) go through all forward ops and induct all variables that will be hold in memory @@ -825,9 
+831,7 @@ def _append_backward_ops_with_checkpoints_( program_stat.get_out_of_subgraph_vars(segment[0], segment[1])) cross_vars = set(vars_should_be_hold) - set(checkpoints_name) - _logger.debug("found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars".format( \ - len(cross_vars), cross_vars)) - _logger.debug("found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars".format( \ + _logger.info("found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars".format( \ len(cross_vars), cross_vars)) # b. output of seed op should be kept in memory @@ -843,6 +847,7 @@ def _append_backward_ops_with_checkpoints_( vars_in_memory = vars_should_be_hold + checkpoints_name max_calculated_op_position = len(ops) + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() if recompute_segments == []: gap_ops = ops[0:max_calculated_op_position] for op in reversed(gap_ops): @@ -852,6 +857,11 @@ def _append_backward_ops_with_checkpoints_( _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + # Set device for grad_op according to forward Op + if op.desc.has_attr(device_attr_name): + op_device = op.desc.attr(device_attr_name) + for op_desc in grad_op_desc: + op_desc._set_attr(device_attr_name, op_device) added_descs = _add_descs_to_block(grad_op_desc, local_block) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -866,6 +876,11 @@ def _append_backward_ops_with_checkpoints_( _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + # Set device for grad_op according to forward Op + if op.desc.has_attr(device_attr_name): + op_device = op.desc.attr(device_attr_name) + for op_desc in grad_op_desc: + op_desc._set_attr(device_attr_name, op_device) added_descs = _add_descs_to_block(grad_op_desc, local_block) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -888,6 +903,17 @@ def _append_backward_ops_with_checkpoints_( continue if name not in var_name_dict: var_name_dict[name] = name + var_suffix + + # we should create the rename var in subprog, otherwise its VarType will be BOOL + ref_var = block.program.global_block().var(name) + block.create_var( + name=var_name_dict[name], + shape=ref_var.shape, + dtype=ref_var.dtype, + type=ref_var.type, + persistable=ref_var.persistable, + stop_gradient=ref_var.stop_gradient) + # 3.a. add ops in current recompute_segment as forward recomputation ops buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block, vars_in_memory) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index f4620ff00013c8..66b11d1f17ad41 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -468,7 +468,7 @@ def _is_target_layer(self, layer): """ Whether the layer needs to calculate output scales. 
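+        A layer qualifies if it is an instance of one of the classes in
+        utils.quant_output_layers, or if its full name marks it as quantized
+        (but not quantized_noweight).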
""" - return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ + return isinstance(layer, utils.quant_output_layers) \ or ('quantized' in layer.full_name() and \ 'quantized_noweight' not in layer.full_name()) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index f45eb8c97f419e..004e1c1aa9bc50 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -43,28 +43,18 @@ "fake_quantize_dequantize_moving_average_abs_max" ] -quant_output_layers_map = { - 'Conv2D': paddle.nn.Conv2D, - 'Conv2DTranspose': paddle.nn.Conv2DTranspose, - 'Linear': paddle.nn.Linear, - 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, - 'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D, - 'AvgPool2D': paddle.nn.AvgPool2D, - 'MaxPool2D': paddle.nn.MaxPool2D, - 'BatchNorm': paddle.nn.BatchNorm, - 'BatchNorm2D': paddle.nn.BatchNorm2D, - 'SyncBatchNorm': paddle.nn.SyncBatchNorm, - 'ELU': paddle.nn.ELU, - 'GELU': paddle.nn.GELU, - 'LeakyReLU': paddle.nn.LeakyReLU, - 'PReLU': paddle.nn.PReLU, - 'ReLU': paddle.nn.ReLU, - 'ReLU6': paddle.nn.ReLU6, - 'Sigmoid': paddle.nn.Sigmoid, - 'Softmax': paddle.nn.Softmax, - 'Tanh': paddle.nn.Tanh, - 'Swish': paddle.nn.Swish, -} +quant_output_layers = ( + paddle.nn.Conv2D, paddle.nn.Conv2DTranspose, paddle.nn.Linear, + paddle.nn.AdaptiveAvgPool2D, paddle.nn.AdaptiveMaxPool2D, + paddle.nn.AvgPool2D, paddle.nn.MaxPool2D, paddle.nn.BatchNorm, + paddle.nn.BatchNorm2D, paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm, + paddle.nn.ELU, paddle.nn.GELU, paddle.nn.Hardshrink, paddle.nn.Hardsigmoid, + paddle.nn.Hardswish, paddle.nn.Hardtanh, paddle.nn.LeakyReLU, + paddle.nn.LogSigmoid, paddle.nn.LogSoftmax, paddle.nn.Maxout, + paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, paddle.nn.SELU, + paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Softplus, + paddle.nn.Softshrink, paddle.nn.Softsign, paddle.nn.Swish, paddle.nn.Tanh, + paddle.nn.Tanhshrink, paddle.nn.ThresholdedReLU, paddle.nn.Upsample) weight_op_types = [ "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 4c24eb3d7fcc8e..d3dc26c946df45 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -279,6 +279,7 @@ def to_list(s): from .core_avx import _set_process_signal_handler from .core_avx import _throw_error_if_process_failed from .core_avx import _convert_to_tensor_list + from .core_avx import _array_to_share_memory_tensor from .core_avx import _cleanup_mmap_fds from .core_avx import _remove_tensor_list_mmap_fds except Exception as e: @@ -333,6 +334,7 @@ def to_list(s): from .core_noavx import _set_process_signal_handler from .core_noavx import _throw_error_if_process_failed from .core_noavx import _convert_to_tensor_list + from .core_noavx import _array_to_share_memory_tensor from .core_noavx import _cleanup_mmap_fds from .core_noavx import _remove_tensor_list_mmap_fds except Exception as e: diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 0cd12e874d9e36..167c7987c55d30 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -166,7 +166,9 @@ def _thread_loop(self, legacy_expected_place): # pack as LoDTensorArray array = core.LoDTensorArray() for slot in batch: - if not isinstance(slot, 
core.LoDTensor): + if isinstance(slot, paddle.Tensor): + slot = slot.value().get_tensor() + elif not isinstance(slot, core.LoDTensor): tmp = core.LoDTensor() tmp.set(slot, core.CPUPlace()) slot = tmp @@ -388,7 +390,9 @@ def _thread_loop(self, legacy_expected_place): # LoDTensor not in shared memory is not # serializable, cannot be create in workers for slot in batch: - if not isinstance(slot, core.LoDTensor): + if isinstance(slot, paddle.Tensor): + slot = slot.value().get_tensor() + elif not isinstance(slot, core.LoDTensor): tmp = core.LoDTensor() tmp.set(slot, core.CPUPlace()) slot = tmp diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py index 9382a704223704..41e12fbc68ec16 100644 --- a/python/paddle/fluid/dataloader/fetcher.py +++ b/python/paddle/fluid/dataloader/fetcher.py @@ -27,8 +27,8 @@ def fetch(self, batch_indices): class _IterableDatasetFetcher(_DatasetFetcher): def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): - super(_IterableDatasetFetcher, self).__init__(dataset, auto_collate_batch, - collate_fn, drop_last) + super(_IterableDatasetFetcher, self).__init__( + dataset, auto_collate_batch, collate_fn, drop_last) self.dataset_iter = iter(dataset) def fetch(self, batch_indices): @@ -53,7 +53,8 @@ def fetch(self, batch_indices): class _MapDatasetFetcher(_DatasetFetcher): def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): - super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch, collate_fn, drop_last) + super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch, + collate_fn, drop_last) def fetch(self, batch_indices): if self.auto_collate_batch: diff --git a/python/paddle/fluid/dataloader/flat.py b/python/paddle/fluid/dataloader/flat.py index 6cccbc7ee4ea7d..db3a725ece01c2 100644 --- a/python/paddle/fluid/dataloader/flat.py +++ b/python/paddle/fluid/dataloader/flat.py @@ -36,14 +36,10 @@ def _flatten_batch(batch): def _flatten(batch, flat_batch, structure, field_idx): if isinstance(batch, Sequence): for field in batch: - if isinstance(field, np.ndarray): + if isinstance(field, (np.ndarray, paddle.Tensor)): structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) flat_batch.append(field) field_idx += 1 - elif isinstance(field, paddle.Tensor): - structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) - flat_batch.append(field.numpy()) - field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): structure.append(field) elif isinstance(field, Sequence): @@ -58,14 +54,10 @@ def _flatten(batch, flat_batch, structure, field_idx): structure.append(field) elif isinstance(batch, Mapping): for k, field in batch.items(): - if isinstance(field, np.ndarray): + if isinstance(field, (np.ndarray, paddle.Tensor)): structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) flat_batch.append(field) field_idx += 1 - elif isinstance(field, paddle.Tensor): - structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) - flat_batch.append(field.numpy()) - field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): structure[k] = field elif isinstance(field, Sequence): diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 2d1b554e53d68c..26bd1f06e12e84 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -238,7 +238,11 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, out_queue.put((idx, batch, None)) batch, structure = _flatten_batch(batch) if 
use_shared_memory: - tensor_list = core._convert_to_tensor_list(batch) + tensor_list = [ + core._array_to_share_memory_tensor(b) + if isinstance(b, np.ndarray) else b._share_memory() + for b in batch + ] out_queue.put((idx, tensor_list, structure)) core._remove_tensor_list_mmap_fds(tensor_list) else: diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 08d58e0c808b83..be5d9ac58311b5 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -26,6 +26,7 @@ from ..data_feeder import convert_dtype import warnings from ..framework import _get_paddle_place +import paddle __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 4b35d778459703..40ab19184c9c8c 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -509,33 +509,33 @@ def save(layer, path, input_spec=None, **configs): Saves input Layer as ``paddle.jit.TranslatedLayer`` format model, which can be used for inference or fine-tuning after loading. - It will save the translated program and all related persistable + It will save the translated program and all related persistable variables of input Layer to given ``path`` . - - ``path`` is the prefix of saved objects, and the saved translated program file + + ``path`` is the prefix of saved objects, and the saved translated program file suffix is ``.pdmodel`` , the saved persistable variables file suffix is ``.pdiparams`` , - and here also saved some additional variable description information to a file, + and here also saved some additional variable description information to a file, its suffix is ``.pdiparams.info``, these additional information is used in fine-tuning. The saved model can be loaded by follow APIs: - - ``paddle.jit.load`` - - ``paddle.static.load_inference_model`` + - ``paddle.jit.load`` + - ``paddle.static.load_inference_model`` - Other C++ inference APIs Args: layer (Layer): The Layer to be saved. path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. - input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward - method, which can be described by InputSpec or example Tensor. If None, all input variables of + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward + method, which can be described by InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. - **configs (dict, optional): Other save configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): Other save configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: (1) output_spec (list[Tensor]): Selects the output targets of the saved model. - By default, all return variables of original Layer's forward method are kept as the - output of the saved model. If the provided ``output_spec`` list is not all output variables, - the saved model will be pruned according to the given ``output_spec`` list. 
+ By default, all return variables of original Layer's forward method are kept as the + output of the saved model. If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given ``output_spec`` list. Returns: None @@ -793,8 +793,8 @@ def load(path, **configs): """ :api_attr: imperative - Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or - paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, + Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or + paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, then performing inference or fine-tune training. .. note:: @@ -807,14 +807,14 @@ def load(path, **configs): Args: path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix`` . - **configs (dict, optional): Other load configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): Other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (str): The inference model file name of the paddle 1.x - ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (str): The persistable variables file name of the paddle 1.x - ``save_inference_model`` save format. No default file name, save variables separately + (1) model_filename (str): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (str): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately by default. @@ -960,7 +960,7 @@ def __len__(self): loader = paddle.io.DataLoader(dataset, feed_list=[image, label], places=place, - batch_size=BATCH_SIZE, + batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2) @@ -969,7 +969,7 @@ def __len__(self): for data in loader(): exe.run( static.default_main_program(), - feed=data, + feed=data, fetch_list=[avg_loss]) model_path = "fc.example.model" @@ -1052,7 +1052,7 @@ def _trace(layer, class TracedLayer(object): """ :api_attr: imperative - + TracedLayer is used to convert a forward dygraph model to a static graph model. This is mainly used to save the dygraph model for online inference using C++. Besides, users can also do inference in Python @@ -1132,7 +1132,7 @@ def __init__(self): def forward(self, input): return self._fc(input) - + layer = ExampleLayer() in_var = paddle.uniform(shape=[2, 3], dtype='float32') out_dygraph, static_layer = paddle.jit.TracedLayer.trace(layer, inputs=[in_var]) @@ -1244,13 +1244,16 @@ def __call__(self, inputs): return self._run(self._build_feed(inputs)) @switch_to_static_graph - def save_inference_model(self, dirname, feed=None, fetch=None): + def save_inference_model(self, path, feed=None, fetch=None): """ Save the TracedLayer to a model for inference. The saved inference model can be loaded by C++ inference APIs. + ``path`` is the prefix of saved objects, and the saved translated program file + suffix is ``.pdmodel`` , the saved persistable variables file suffix is ``.pdiparams`` . 
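The prefix convention above is shared with ``paddle.jit.save``: saving to the prefix ``example/net`` produces ``example/net.pdmodel``, ``example/net.pdiparams`` and ``example/net.pdiparams.info``. A minimal sketch of the save/load round trip under that convention, assuming a Paddle 2.x dygraph session (``SimpleNet`` is a hypothetical layer, not part of this patch):

import paddle
from paddle.static import InputSpec

class SimpleNet(paddle.nn.Layer):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self._linear = paddle.nn.Linear(10, 3)

    def forward(self, x):
        return self._linear(x)

net = SimpleNet()
# "example/net" is the path prefix: this writes example/net.pdmodel,
# example/net.pdiparams and example/net.pdiparams.info
paddle.jit.save(net, "example/net",
                input_spec=[InputSpec(shape=[None, 10], dtype='float32')])

# the saved model loads back as a TranslatedLayer for inference or fine-tuning
loaded = paddle.jit.load("example/net")
out = loaded(paddle.rand([4, 10]))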
+ Args: - dirname (str): the directory to save the inference model. + path(str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. feed (list[int], optional): the input variable indices of the saved inference model. If None, all input variables of the TracedLayer object would be the inputs of the saved inference @@ -1294,7 +1297,7 @@ def forward(self, input): fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars) print(fetch.shape) # (2, 10) """ - check_type(dirname, "dirname", str, + check_type(path, "path", str, "fluid.dygraph.jit.TracedLayer.save_inference_model") check_type(feed, "feed", (type(None), list), "fluid.dygraph.jit.TracedLayer.save_inference_model") @@ -1309,6 +1312,18 @@ def forward(self, input): check_type(f, "each element of fetch", int, "fluid.dygraph.jit.TracedLayer.save_inference_model") + # path check + file_prefix = os.path.basename(path) + if file_prefix == "": + raise ValueError( + "The input path MUST be format of dirname/file_prefix " + "[dirname\\file_prefix in Windows system], but received " + "file_prefix is empty string.") + + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) + from paddle.fluid.io import save_inference_model def get_feed_fetch(all_vars, partial_vars): @@ -1326,9 +1341,14 @@ def get_feed_fetch(all_vars, partial_vars): assert target_var is not None, "{} cannot be found".format(name) target_vars.append(target_var) + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX + save_inference_model( dirname=dirname, feeded_var_names=feeded_var_names, target_vars=target_vars, executor=self._exe, - main_program=self._program.clone()) + main_program=self._program.clone(), + model_filename=model_filename, + params_filename=params_filename) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index b157ce81d82fc7..36637abc6d0b85 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -22,6 +22,9 @@ import weakref import warnings from copy import deepcopy +import inspect + +import paddle from . import parallel_helper from .. import unique_name @@ -894,9 +897,15 @@ def __call__(self, *inputs, **kwargs): if not self._built: with program_desc_tracing_guard(False): self._build_once(*inputs, **kwargs) - if parallel_helper._is_data_parallel_mode(): + + # TODO(liuyuhui) Only xpu broadcast parameters here. + # The other device is to call _sync_params_buffers in DataParallel + # to realize the parameter synchronization among multiply cards. + if parallel_helper._is_data_parallel_mode( + ) and paddle.is_compiled_with_xpu(): parallel_helper._broadcast_parameters( self._parameters.values()) + self._built = True outputs = self.forward(*inputs, **kwargs) @@ -1287,10 +1296,12 @@ def _check_match(key, param): if state is None: raise ValueError("{} is not found in the provided dict.".format( key)) - if list(state.shape) != list(param.shape): + state_shape = state.shape() if inspect.ismethod( + state.shape) else state.shape + if list(state_shape) != list(param.shape): raise ValueError( "{} receives a shape {}, but the expected shape is {}.". 
- format(key, list(state.shape), list(param.shape)))
+ format(key, list(state_shape), list(param.shape)))
return param, state
matched_param_state = []
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 6decff69ad65cf..ce728f1121dfdb 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -174,6 +174,11 @@ def __init__(self, dtype='float32'):
assert param_attr is not False, "param_attr should not be False here."
super(Conv2D, self).__init__()
+
+ if (core.is_compiled_with_cuda() and paddle.fluid.get_flags(
+ "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]):
+ use_cudnn = False
+
self._num_channels = num_channels
self._groups = groups
self._stride = utils.convert_to_list(stride, 2, 'stride')
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 2ef72f6c5aaf4b..b80621e21f1c5c 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -24,6 +24,7 @@
from paddle.fluid.dygraph import parallel_helper
from paddle.fluid.dygraph import to_variable, no_grad
from paddle.utils import deprecated
+from ..layers import collective
import warnings
import paddle
import itertools
@@ -348,6 +349,18 @@ class DataParallel(layers.Layer):
last_comm_buffer_size(float, optional): It limits memory size(MB) of last buffer in communication
calling. Making the last communication buffer size small is useful to
improve performance. Default: 1.
+ find_unused_parameters(bool, optional): Whether to traverse the entire backward graph from
+ all tensors in the return value of the wrapped model's
+ forward function. For parameters not involved in loss
+ calculation, their gradients will be marked as ready in
+ advance to prepare for the reduce step. Please note that all forward
+ outputs derived from the wrapped model parameters must
+ participate in the calculation of loss and subsequent
+ gradient calculations. If not, a serious error will occur.
+ Note that setting find_unused_parameters to True
+ will affect computing performance. Therefore, if you are sure that all
+ parameters participate in the loss calculation and the
+ autograd graph construction, please set it to False. Default: True.
Returns:
Layer: The data paralleled module.
@@ -403,11 +416,13 @@ def __init__(self,
layers,
strategy=None,
comm_buffer_size=25,
- last_comm_buffer_size=1):
+ last_comm_buffer_size=1,
+ find_unused_parameters=True):
super(DataParallel, self).__init__(layers.full_name() + "_data_parallel")
self._layers = layers
+ self.find_unused_parameters = find_unused_parameters
# NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy.
# It just stores some environment variables, which can be constructed by
@@ -419,6 +434,17 @@ def __init__(self,
self._strategy = _build_default_parallel_strategy()
if self._strategy.nranks > 1:
+ # check the environment
+ assert parallel_helper.__parallel_ctx__clz__ is not None, \
+ "ParallelContext must be initialized before. You should use init_parallel_env() before " \
+ "constructing the DataParallel."
+
+ # sync buffer and params
+ # TODO(liuyuhui) xpu is not supported yet; xpu still broadcasts
+ # parameters when calling the layer
+ if not paddle.is_compiled_with_xpu():
+ self._sync_params_buffers()
+
self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024)
# NOTE(shenliang03): We can set environment variables to control
# the size of the group, Default: 1MB.
The role of this small group is: @@ -449,6 +475,10 @@ def init_reducer(self): trainable_parameters = [param for _, param in layers_param] + assert len(trainable_parameters) > 0, \ + "This model does not have any parameters to train, and " \ + "does not need to use DataParallel" + # NOTE(shenliang03): Here we can only use the attributes to judge whether # parameter is sparse(or SelectedRows). The reason is that the sparse message # can't be obtained when bp hasn't happened yet. So if layer supports sparse parameter, @@ -470,19 +500,12 @@ def check_layer_sparse(sublayer): trainable_parameters, is_sparse_gradient, [self.last_comm_buffer_size, self.comm_buffer_size]) - assert parallel_helper.__parallel_ctx__clz__ is not None, \ - "ParallelContext must be initialized before. You should use init_parallel_env() before" \ - "constructing the DataParallel." - - # TODO(shenliang03) "find_unused_vars" interface will be exposed in the future - # to handle control flow to process unused parameters - find_unused_vars = True self._reducer = core.Reducer( trainable_parameters, list(reversed(self.group_indices)), is_sparse_gradient, parallel_helper.__parallel_ctx__clz__, [self.last_comm_buffer_size, self.comm_buffer_size], - find_unused_vars) + self.find_unused_parameters) def _find_varbase(self, obj): if isinstance(obj, core.VarBase): @@ -493,11 +516,54 @@ def _find_varbase(self, obj): return itertools.chain(*map(self._find_varbase, obj.values())) return [] + def _sync_params_buffers(self): + model_vars = [] + for _, param in self._layers.state_dict().items(): + if not isinstance(param, core.VarBase): + raise TypeError("The data type of '%s' must be Varbase" % + param.name) + model_vars.append(param.detach()) + if len(model_vars) == 0: + return + + mega_bytes = 128 * 1024 * 1024 + group_idx = 0 + memory_counter = 0 + var_groups = OrderedDict() + dtype = model_vars[0].dtype + + for var in model_vars: + bytes = np.prod(var.shape) * core.size_of_dtype(var.dtype) + if memory_counter < mega_bytes and dtype == var.dtype: + memory_counter += bytes + else: + memory_counter = 0 + dtype = var.dtype + group_idx += 1 + var_groups.setdefault(group_idx, []).append(var) + + coalesced_vars = _coalesce_tensors(var_groups) + + for coalesced_var, _, _ in coalesced_vars: + collective._broadcast(coalesced_var, root=0, sync_mode=True) + + for coalesced_var, origin_vars, var_shapes in coalesced_vars: + var_len = [np.prod(v_shape) for v_shape in var_shapes] + framework._dygraph_tracer().trace_op( + type='split', + inputs={'X': coalesced_var}, + outputs={'Out': origin_vars}, + attrs={'sections': var_len, + 'axis': 0}) + def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) - if self._strategy.nranks > 1: - self._reducer.prepare_for_backward( - list(self._find_varbase(outputs))) + if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: + if self.find_unused_parameters: + self._reducer.prepare_for_backward( + list(self._find_varbase(outputs))) + else: + self._reducer.prepare_for_backward(list(self._find_varbase([]))) return outputs diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index ac0944c5718908..ac594709867d1c 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -14,6 +14,8 @@ import inspect import numpy as np +import warnings +import weakref import paddle from .. 
import framework
@@ -26,6 +28,34 @@
from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE
+class TensorHookRemoveHelper(object):
+ """
+ A helper class for removing a Tensor gradient's hook.
+ """
+
+ def __init__(self, tensor, hook_id):
+ self._tensor_ref = weakref.ref(tensor)
+ self._hook_id = hook_id
+
+ def remove(self):
+ """
+ Remove the referenced Tensor's hook.
+
+ Returns:
+ bool: Return True if removed successfully
+ """
+ tensor = self._tensor_ref()
+ if tensor is not None:
+ res = tensor._remove_grad_hook(self._hook_id)
+ if res is True:
+ return True
+ else:
+ warnings.warn(
+ "The backward hook (ID: %d) of Tensor `%s` you want to remove does not exist or has been removed."
+ % (self._hook_id, tensor.name), RuntimeWarning)
+ return False
+
+
def monkey_patch_varbase():
@switch_to_static_graph
def _to_static_var(self, to_parameter=False, **kwargs):
@@ -133,7 +163,7 @@ def set_value(self, value):
framework._current_expected_place())
@framework.dygraph_only
- def backward(self, retain_graph=False):
+ def backward(self, grad_tensor=None, retain_graph=False):
"""
Run backward of current Graph which starts from current Tensor.
@@ -142,17 +172,22 @@
You can clear gradient by ``Tensor.clear_grad()`` .
Args:
+ grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
+ the initial gradient values of the current Tensor would be a Tensor filled with 1.0;
+ if `grad_tensor` is not None, it must have the same shape as the current Tensor.
+ The default value is None.
+
retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would
like to add more ops to the built graph after calling this method( :code:`backward` ),
set the parameter :code:`retain_graph` to True, then the grads will be retained.
Thus, setting it to False is much more memory-efficient.
Defaults to False.
-
Returns:
NoneType: None
Examples:
.. code-block:: python
+
import paddle
x = paddle.to_tensor(5., stop_gradient=False)
for i in range(5):
y = paddle.pow(x, 4.0)
@@ -168,15 +203,36 @@
print("{}".format(x.grad))
# 0.
+ grad_tensor=paddle.to_tensor(2.)
+ for i in range(5):
+ y = paddle.pow(x, 4.0)
+ y.backward(grad_tensor)
+ print("{}: {}".format(i, x.grad))
+ # 0: [1000.]
+ # 1: [2000.]
+ # 2: [3000.]
+ # 3: [4000.]
+ # 4: [5000.]
+
"""
if framework.in_dygraph_mode():
+ if grad_tensor is not None:
+ assert isinstance(
+ grad_tensor, paddle.
+ Tensor), "The type of grad_tensor must be paddle.Tensor"
+
+ assert grad_tensor.shape == self.shape, \
+ "Tensor shape does not match: grad_tensor [ {} ] with shape {} does not match Tensor [ {} ] with shape {}".format(
+ grad_tensor.name, grad_tensor.shape, self.name, self.shape)
+
if paddle.is_compiled_with_xpu():
# TODO(liuyuhui): Currently only for xpu. Will be removed in the future.
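As an aside on the ``TensorHookRemoveHelper`` class introduced above: it holds only a ``weakref`` to its Tensor, so keeping the remove-handle alive never keeps the Tensor alive. A generic, self-contained sketch of that pattern (``Owner`` and ``RemoveHelper`` are hypothetical stand-ins, not Paddle APIs):

import weakref

class Owner(object):
    def __init__(self):
        self._hooks = {}

    def _remove_hook(self, hook_id):
        return self._hooks.pop(hook_id, None) is not None

class RemoveHelper(object):
    def __init__(self, owner, hook_id):
        # weak reference: the helper never extends the owner's lifetime
        self._owner_ref = weakref.ref(owner)
        self._hook_id = hook_id

    def remove(self):
        owner = self._owner_ref()  # None if the owner was already collected
        return owner is not None and owner._remove_hook(self._hook_id)

o = Owner()
o._hooks[1] = lambda grad: grad
h = RemoveHelper(o, 1)
print(h.remove())  # True
print(h.remove())  # False: the hook was already removed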
scaled_loss = scale_loss(self) - scaled_loss._run_backward(framework._dygraph_tracer(), - retain_graph) + core.dygraph_run_backward([scaled_loss], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: - self._run_backward(framework._dygraph_tracer(), retain_graph) + core.dygraph_run_backward([self], [grad_tensor], retain_graph, + framework._dygraph_tracer()) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") @@ -211,6 +267,73 @@ def gradient(self): else: return np.array(new_ivar.value().get_tensor()) + @framework.dygraph_only + def register_hook(self, hook): + """ + Registers a backward hook for current Tensor. + + The hook will be called every time the gradient Tensor of current Tensor is computed. + + The hook should not modify the input gradient Tensor, but it can optionally return + a new gradient Tensor which will be used in place of current Tensor's gradient. + + The hook should have the following signature: + + hook(grad) -> Tensor or None + + Args: + hook(function): A backward hook to be registered for Tensor.grad + + Returns: + TensorHookRemoveHelper: A helper object that can be used to remove the registered hook by calling `remove()` method. + + Examples: + .. code-block:: python + + import paddle + + # hook function return None + def print_hook_fn(grad): + print(grad) + + # hook function return Tensor + def double_hook_fn(grad): + grad = grad * 2 + return grad + + x = paddle.to_tensor([0., 1., 2., 3.], stop_gradient=False) + y = paddle.to_tensor([4., 5., 6., 7.], stop_gradient=False) + z = paddle.to_tensor([1., 2., 3., 4.]) + + # one Tensor can register multiple hooks + h = x.register_hook(print_hook_fn) + x.register_hook(double_hook_fn) + + w = x + y + # register hook by lambda function + w.register_hook(lambda grad: grad * 2) + + o = z.matmul(w) + o.backward() + # print_hook_fn print content in backward + # Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [2., 4., 6., 8.]) + + print("w.grad:", w.grad) # w.grad: [1. 2. 3. 4.] + print("x.grad:", x.grad) # x.grad: [ 4. 8. 12. 16.] + print("y.grad:", y.grad) # y.grad: [2. 4. 6. 8.] + + # remove hook + h.remove() + """ + if self.stop_gradient is True: + raise RuntimeError( + "Cannot register hook on a tensor that stop gradient.") + + hook_id = self._register_grad_hook(hook) + helper = TensorHookRemoveHelper(self, hook_id) + return helper + @property def grad(self): """ @@ -316,7 +439,8 @@ def __bool__(self): ("_to_static_var", _to_static_var), ("set_value", set_value), ("block", block), ("backward", backward), ("clear_grad", clear_grad), ("inplace_version", inplace_version), ("grad", grad), - ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__), + ("gradient", gradient), ("register_hook", register_hook), + ("__str__", __str__), ("__repr__", __str__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), ("__name__", "Tensor")): setattr(core.VarBase, method_name, method) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index be795b9e59c097..d5c01d20a91824 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -24,6 +24,7 @@ import traceback import six import copy +from types import MethodType, FunctionType import numpy as np import subprocess @@ -1183,37 +1184,6 @@ def numpy(self): """ pass - @fake_interface_only - def set_value(self, value): - """ - **Notes**: - **This API is ONLY available in Dygraph mode** - - Set a new value for this Variable. 
-
- Args:
- value (Variable|np.ndarray): the new value.
-
- Examples:
- .. code-block:: python
-
- import paddle.fluid as fluid
- from paddle.fluid.dygraph.base import to_variable
- from paddle.fluid.dygraph import Linear
- import numpy as np
-
- data = np.ones([3, 1024], dtype='float32')
- with fluid.dygraph.guard():
- linear = fluid.dygraph.Linear(1024, 4)
- t = to_variable(data)
- linear(t) # call with default weight
- custom_weight = np.random.randn(1024, 4).astype("float32")
- linear.weight.set_value(custom_weight) # change existing weight
- out = linear(t) # call with different weight
-
- """
- pass
-
@fake_interface_only
def backward(self, retain_graph=False):
"""
@@ -2011,6 +1981,159 @@ def replace_ellipsis(item):
return self
+ def get_value(self, scope=None):
+ """
+ Get the value of the variable in the given scope.
+
+ Args:
+ scope(Scope, optional) : If `scope` is None, it will be set to global scope
+ obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`.
+ Default: None
+
+ Returns:
+ Tensor: the value in the given scope.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.static as static
+ import numpy as np
+
+ paddle.enable_static()
+
+ x = static.data(name="x", shape=[10, 10], dtype='float32')
+
+ y = static.nn.fc(x, 10, name='fc')
+ place = paddle.CPUPlace()
+ exe = static.Executor(place)
+ prog = paddle.static.default_main_program()
+ exe.run(static.default_startup_program())
+ inputs = np.ones((10, 10), dtype='float32')
+ exe.run(prog, feed={'x': inputs}, fetch_list=[y, ])
+ path = 'temp/tensor_'
+ for var in prog.list_vars():
+ if var.persistable:
+ t = var.get_value()
+ paddle.save(t, path+var.name+'.pdtensor')
+
+ for var in prog.list_vars():
+ if var.persistable:
+ t_load = paddle.load(path+var.name+'.pdtensor')
+ var.set_value(t_load)
+ """
+ # The 'framework' is a low-level module, and 'executor'
+ # can not be imported at the beginning of this file.
+ # Therefore, 'executor' is imported dynamically here.
+ from .executor import global_scope
+ if scope is not None and not isinstance(scope, core._Scope):
+ raise TypeError(
+ "`scope` should be None or `paddle.static.Scope` type, but received {}.".
+ format(type(scope)))
+
+ if scope is None:
+ scope = global_scope()
+ var_temp = scope.find_var(self.name)
+ if var_temp is None:
+ raise ValueError("Can not find Variable '{}' in the Scope.".format(
+ self.name))
+ t = var_temp.get_tensor()
+ return t
+
+ def set_value(self, value, scope=None):
+ '''
+ Set the value of the tensor in the given scope.
+
+ Args:
+ value(Tensor/ndarray) : The value to be set.
+ scope(Scope, optional) : If `scope` is None, it will be set to global scope
+ obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`.
+ Default: None
+
+ Returns:
+ None
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.static as static
+ import numpy as np
+
+ paddle.enable_static()
+
+ x = static.data(name="x", shape=[10, 10], dtype='float32')
+
+ y = static.nn.fc(x, 10, name='fc')
+ place = paddle.CPUPlace()
+ exe = static.Executor(place)
+ prog = paddle.static.default_main_program()
+ exe.run(static.default_startup_program())
+ inputs = np.ones((10, 10), dtype='float32')
+ exe.run(prog, feed={'x': inputs}, fetch_list=[y, ])
+ path = 'temp/tensor_'
+ for var in prog.list_vars():
+ if var.persistable:
+ t = var.get_value()
+ paddle.save(t, path+var.name+'.pdtensor')
+
+ for var in prog.list_vars():
+ if var.persistable:
+ t_load = paddle.load(path+var.name+'.pdtensor')
+ var.set_value(t_load)
+ '''
+
+ # The 'framework' is a low-level module, and 'executor'
+ # can not be imported at the beginning of this file.
+ # Therefore, 'executor' is imported dynamically here.
+ from .executor import global_scope
+
+ if not (isinstance(value, np.ndarray) or hasattr(value, '__array__')):
+ raise TypeError(
+ "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}.".
+ format(type(value)))
+
+ if scope is not None and not isinstance(scope, core._Scope):
+ raise TypeError(
+ "`scope` should be None or `paddle.static.Scope` type, but received {}.".
+ format(type(scope)))
+
+ if scope is None:
+ scope = global_scope()
+
+ var_temp = scope.find_var(self.name)
+ if var_temp is None:
+ raise ValueError("Can not find Variable '{}' in the Scope.".format(
+ self.name))
+
+ t = var_temp.get_tensor()
+
+ if hasattr(value, 'shape'):
+ if isinstance(value.shape, (MethodType, FunctionType)):
+ value_shape = value.shape()
+ else:
+ value_shape = value.shape
+ if list(t.shape()) != list(value_shape):
+ raise ValueError(
+ "{} expected a shape {}, but the received shape is {}.".
+ format(self.name, list(t.shape()), list(value_shape)))
+
+ p = t._place()
+ if p.is_cpu_place():
+ place = core.CPUPlace()
+ elif p.is_cuda_pinned_place():
+ place = core.CUDAPinnedPlace()
+ elif p.is_xpu_place():
+ p = core.Place()
+ p.set_place(t._place())
+ place = core.XPUPlace(p.xpu_device_id())
+ else:
+ p = core.Place()
+ p.set_place(t._place())
+ place = core.CUDAPlace(p.gpu_device_id())
+
+ t.set(value, place)
+
def get_all_op_protos():
"""
@@ -5319,6 +5442,173 @@ def all_parameters(self):
parameters.extend(each_block.all_parameters())
return parameters
+ def state_dict(self, mode='all', scope=None):
+ """
+ Get parameters and persistable buffers of the program as a dict. The key is the name of the parameter or the name of the buffer.
+ The value is the tensor of this variable in the given scope.
+
+ .. note::
+ This function MUST be called after running the startup program.
+
+ Args:
+ mode(str, optional): Source of the obtained parameters and buffers.
+ 'opt' : The return value only contains the variable in the optimizer.
+ 'param' : The return value only contains the variable in the network, not the variable in the optimizer.
+ 'all' : The return value contains the variable in the network and optimizer.
+ Default: 'all'
+ scope(Scope, optional) : If scope is None, the variables will be fetched from the global scope
+ obtained through 'paddle.static.global_scope()'. Otherwise, they will be fetched from the given scope.
+ Default: None
+
+ Returns:
+ dict: a dict that contains the parameters and persistable buffers.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.static as static
+
+ paddle.enable_static()
+
+ x = static.data(name="x", shape=[10, 10], dtype='float32')
+ y = static.nn.fc(x, 10)
+ z = static.nn.fc(y, 10)
+
+ place = paddle.CPUPlace()
+ exe = static.Executor(place)
+ exe.run(static.default_startup_program())
+ prog = static.default_main_program()
+
+ path = "./temp/model.pdparams"
+ paddle.save(prog.state_dict(), path)
+ """
+ # The 'framework' is a low-level module, and 'executor'
+ # can not be imported at the beginning of this file.
+ # Therefore, 'executor' is imported dynamically here.
+ from .executor import global_scope
+ if scope is not None and not isinstance(scope, core._Scope):
+ raise TypeError(
+ "`scope` should be None or `paddle.static.Scope` type, but received {}.".
+ format(type(scope)))
+
+ if scope is None:
+ scope = global_scope()
+
+ if not isinstance(mode, str):
+ raise TypeError("Type of `mode` should be string, but received {}.".
+ format(type(mode)))
+
+ def is_parameter(var):
+ return isinstance(var, Parameter)
+
+ def is_persistable(var):
+ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+ var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+ var.desc.type() == core.VarDesc.VarType.READER:
+ return False
+ return var.persistable
+
+ def is_belong_to_optimizer(var):
+ if not (isinstance(var, Parameter) or var.desc.need_check_feed()):
+ return is_persistable(var)
+ return False
+
+ def condition(var):
+
+ if mode == 'param':
+ return is_parameter(var)
+ elif mode == 'opt':
+ return is_belong_to_optimizer(var)
+ elif mode == 'all':
+ return is_parameter(var) or is_belong_to_optimizer(var)
+ else:
+ raise ValueError(
+ "`mode` string should be 'param', 'opt' or 'all', but received {}.".
+ format(mode))
+
+ var_list = filter(condition, self.list_vars())
+
+ state_dict = dict()
+ for var in var_list:
+ var_temp = scope.find_var(var.name)
+ if var_temp is None:
+ raise ValueError(
+ "Can not find Variable '{}' in the scope. Make sure it is initialized.".
+ format(var.name))
+ state_dict[var.name] = var_temp.get_tensor()
+
+ return state_dict
+
+ def set_state_dict(self, state_dict, scope=None):
+ """
+ Set parameters and persistable buffers in state_dict to program.
+ An exception will be thrown if the shape or dtype of the parameters does not match.
+
+ .. note::
+ This function MUST be called after running the startup program.
+
+ Args:
+ state_dict(dict): the dict that stores parameters and persistable buffers.
+ The key is the name of the parameter or the name of the buffer.
+ The value is the tensor of this variable in the given scope.
+ scope(Scope, optional) : If scope is None, the variables will be set in the global scope
+ obtained through 'paddle.static.global_scope()'. Otherwise, they will be set in the given scope.
+ Default: None
+
+ Returns:
+ None
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.static as static
+
+ paddle.enable_static()
+
+ x = static.data(name="x", shape=[10, 10], dtype='float32')
+ y = static.nn.fc(x, 10)
+ z = static.nn.fc(y, 10)
+
+ place = paddle.CPUPlace()
+ exe = static.Executor(place)
+ exe.run(static.default_startup_program())
+ prog = static.default_main_program()
+
+ path = "./temp/model.pdparams"
+ paddle.save(prog.state_dict(), path)
+ state_dict_load = paddle.load(path)
+ prog.set_state_dict(state_dict_load)
+ """
+
+ if not isinstance(state_dict, dict):
+ raise TypeError(
+ "Type of `state_dict` should be dict, but received {}.".format(
+ type(state_dict)))
+
+ vars_dict = {var.name: var for var in self.list_vars()}
+ condition = True if 'StructuredToParameterName@@' in state_dict else False
+ for name, value in state_dict.items():
+ if condition:
+ if name == "StructuredToParameterName@@":
+ continue
+ if name in state_dict['StructuredToParameterName@@']:
+ name = state_dict['StructuredToParameterName@@'][name]
+ if name in vars_dict:
+ try:
+ vars_dict[name].set_value(value, scope)
+ except ValueError as err:
+ warnings.warn(
+ ("Skip loading for '{}'. ".format(name) + str(err)))
+ except TypeError as err:
+ warnings.warn(
+ ("Skip loading for '{}'. ".format(name) + str(err)))
+ else:
+ warnings.warn((
+ "Skip loading for '{0}', because '{0}' is not in the program.".
+ format(name)))
+
@six.add_metaclass(ParameterMetaClass)
class Parameter(Variable):
diff --git a/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh b/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh
index 1df6b0618de8d7..cac2f7234bdf2f 100644
--- a/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh
+++ b/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh
@@ -1,5 +1,19 @@
#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
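One detail of ``set_state_dict`` worth spelling out: structured keys are translated to parameter names through the ``'StructuredToParameterName@@'`` entry, and names that do not resolve are skipped with a warning instead of raising. A self-contained re-statement of that resolution logic (the dict contents below are hypothetical):

import warnings
import numpy as np

state_dict = {
    'StructuredToParameterName@@': {'linear.weight': 'fc_0.w_0'},
    'linear.weight': np.ones([10, 10], dtype='float32'),  # structured key
    'stale.buffer': np.zeros([1], dtype='float32'),       # unknown name
}
program_vars = {'fc_0.w_0'}  # names that exist in the Program

name_map = state_dict.get('StructuredToParameterName@@', {})
for name, value in state_dict.items():
    if name == 'StructuredToParameterName@@':
        continue
    name = name_map.get(name, name)  # 'linear.weight' -> 'fc_0.w_0'
    if name in program_vars:
        print("would call set_value on", name)
    else:
        warnings.warn("Skip loading for '{}'.".format(name))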
+
# start pserver0
python fleet_deep_ctr.py \
--role pserver \
diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 9cca3e16de5132..cfb4b125993855 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -1765,7 +1765,30 @@ def _pack_loaded_dict(load_obj):
@static_only
-def save(program, model_path, pickle_protocol=2):
+def _legacy_save(param_dict, model_path, protocol=2):
+ def get_tensor(var):
+ if isinstance(var, core.VarBase):
+ return var.numpy()
+ elif isinstance(var, core.LoDTensor):
+ return np.array(var)
+ return var
+
+ param_dict = {name: get_tensor(param_dict[name]) for name in param_dict}
+
+ # When the value of a dict is larger than 4GB, there is a bug on 'MAC python3'
+ if sys.platform == 'darwin' and sys.version_info.major == 3:
+ pickle_bytes = pickle.dumps(param_dict, protocol=protocol)
+ with open(model_path, 'wb') as f:
+ max_bytes = 2**30
+ for i in range(0, len(pickle_bytes), max_bytes):
+ f.write(pickle_bytes[i:i + max_bytes])
+ else:
+ with open(model_path, 'wb') as f:
+ pickle.dump(param_dict, f, protocol=protocol)
+
+
+@static_only
+def save(program, model_path, protocol=2, **configs):
"""
:api_attr: Static Graph
@@ -1778,8 +1801,9 @@
Args:
program(Program) : The program to be saved.
model_path(str): the file prefix to save the program. The format is "dirname/file_prefix". If file_prefix is an empty str, an exception will be raised
- pickle_protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5.
+ protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5.
Default: 2
+ configs(dict, optional) : optional keyword arguments.
Returns:
None
@@ -1807,14 +1831,19 @@
base_name = os.path.basename(model_path)
assert base_name != "", \
"The input model_path MUST be format of dirname/filename [dirname\\filename in Windows system], but received model_path is empty string."
+ if 'pickle_protocol' in configs:
+ protocol = configs['pickle_protocol']
+ warnings.warn(
+ "'pickle_protocol' is a deprecated argument. Please use 'protocol' instead."
+ )
- if not isinstance(pickle_protocol, int):
+ if not isinstance(protocol, int):
raise ValueError("The 'protocol' MUST be `int`, but received {}".format(
- type(pickle_protocol)))
+ type(protocol)))
- if pickle_protocol < 2 or pickle_protocol > 4:
+ if protocol < 2 or protocol > 4:
raise ValueError("Expected 1<'protocol'<5, but received protocol={}".
- format(pickle_protocol))
+ format(protocol))
dir_name = os.path.dirname(model_path)
if dir_name and not os.path.exists(dir_name):
@@ -1827,26 +1856,25 @@ def get_tensor(var):
parameter_list = list(filter(is_parameter, program.list_vars()))
param_dict = {p.name: get_tensor(p) for p in parameter_list}
- param_dict = _unpack_saved_dict(param_dict, pickle_protocol)
+ param_dict = _unpack_saved_dict(param_dict, protocol)
- # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3.5/6'
- if sys.platform == 'darwin' and sys.version_info.major == 3 and (
- sys.version_info.minor == 5 or sys.version_info.minor == 6):
- pickle_bytes = pickle.dumps(param_dict, protocol=pickle_protocol)
+ # When the value of a dict is larger than 4GB, there is a bug on 'MAC python3'
+ if sys.platform == 'darwin' and sys.version_info.major == 3:
+ pickle_bytes = pickle.dumps(param_dict, protocol=protocol)
with open(model_path + ".pdparams", 'wb') as f:
max_bytes = 2**30
for i in range(0, len(pickle_bytes), max_bytes):
f.write(pickle_bytes[i:i + max_bytes])
else:
with open(model_path + ".pdparams", 'wb') as f:
- pickle.dump(param_dict, f, protocol=pickle_protocol)
+ pickle.dump(param_dict, f, protocol=protocol)
optimizer_var_list = list(
filter(is_belong_to_optimizer, program.list_vars()))
opt_dict = {p.name: get_tensor(p) for p in optimizer_var_list}
with open(model_path + ".pdopt", 'wb') as f:
- pickle.dump(opt_dict, f, protocol=pickle_protocol)
+ pickle.dump(opt_dict, f, protocol=protocol)
main_program = program.clone()
program.desc.flush()
@@ -1857,6 +1885,17 @@ def get_tensor(var):
f.write(program.desc.serialize_to_string())
+def _pickle_loads_mac(path, f):
+ pickle_bytes = bytearray(0)
+ file_size = os.path.getsize(path)
+ max_bytes = 2**30
+ for _ in range(0, file_size, max_bytes):
+ pickle_bytes += f.read(max_bytes)
+ load_result = pickle.loads(pickle_bytes) if six.PY2 else pickle.loads(
+ pickle_bytes, encoding='latin1')
+ return load_result
+
+
@static_only
def load(program, model_path, executor=None, var_list=None):
"""
@@ -2016,8 +2055,13 @@ def set_var(var, ndarray):
global_scope(), executor._default_executor)
with open(parameter_file_name, 'rb') as f:
- load_dict = pickle.load(f) if six.PY2 else pickle.load(
- f, encoding='latin1')
+
+ # When the value of a dict is larger than 4GB, there is a bug on 'MAC python3'
+ if sys.platform == 'darwin' and sys.version_info.major == 3:
+ load_dict = _pickle_loads_mac(parameter_file_name, f)
+ else:
+ load_dict = pickle.load(f) if six.PY2 else pickle.load(
+ f, encoding='latin1')
load_dict = _pack_loaded_dict(load_dict)
for v in parameter_list:
assert v.name in load_dict, \
@@ -2196,8 +2240,12 @@ def _load_vars_with_try_catch(exe,
"Parameter file [{}] not exists".format(parameter_file_name)
with open(parameter_file_name, 'rb') as f:
- para_dict = pickle.load(f) if six.PY2 else pickle.load(
- f, encoding='latin1')
+ # When the value of a dict is larger than 4GB, there is a bug on 'MAC python3'
+ if sys.platform == 'darwin' and sys.version_info.major == 3:
+ para_dict = _pickle_loads_mac(parameter_file_name, f)
+ else:
+ para_dict = pickle.load(f) if six.PY2 else pickle.load(
+ f, encoding='latin1')
para_dict = _pack_loaded_dict(para_dict)
opt_file_name = model_prefix + ".pdopt"
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 6bc69ffd5cd324..34dc1e9b346ecb 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -9260,6 +9260,9 @@ def affine_grid(theta, out_shape, name=None):
'affine_grid')
else:
attrs['output_shape'] = out_shape + if core.is_compiled_with_rocm(): + # ROCM platform do not have MIOPEN kernel for affine_grid + attrs['use_cudnn'] = False helper.append_op( type='affine_grid', diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 84f99962e84307..7458466b02fd4e 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -635,7 +635,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``shape`` is an Tensor, it should be an 1-D Tensor with date type int32 or int64. dtype(np.dtype|str): Data type of the output Tensor which can - be float16, float32, float64, int32, int64. + be float16, float32, float64, uint8, int32, int64. value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. If ``value`` is an Tensor, it should be an 1-D Tensor. force_cpu(bool, optional): data should be on CPU if it's true, default value is False. @@ -673,7 +673,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): attrs = {'force_cpu': force_cpu} dtype = convert_dtype(dtype) if not isinstance(value, Variable): - if dtype in ['int64', 'int32']: + if dtype in ['uint8', 'int64', 'int32']: attrs['str_value'] = str(int(value)) attrs['value'] = int(value) else: @@ -686,7 +686,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): out = _varbase_creator(dtype=dtype) if isinstance(value, Variable): - if dtype in ['int64', 'int32']: + if dtype in ['uint8', 'int64', 'int32']: attrs['str_value'] = str(int(value.numpy().item(0))) else: attrs['str_value'] = str(float(value.numpy().item(0))) @@ -706,9 +706,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): inputs['ValueTensor'] = value check_shape(shape) - check_dtype(dtype, 'dtype', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'fill_constant') + check_dtype( + dtype, 'dtype', + ['bool', 'float16', 'float32', 'float64', 'uint8', 'int32', 'int64'], + 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') if out is not None: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 2aa918bf806616..27ce44a257e786 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3805,7 +3805,6 @@ def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): self._param_device_map = None self._pipeline_pair = [] self._pp_ring_map = dict() - self._global_ring_id = None # insert allreduce op to sync global information for global # gradient clip and amp @@ -3841,7 +3840,7 @@ def _insert_allreduce_op(self, op_idx, block): inputs={'X': temp_var if op.type == "reduce_any" else out_var}, outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, attrs={ - 'ring_id': self._global_ring_id, + 'ring_id': self.global_ring_id, self._op_role_key: self._op_role.Optimize, 'use_calc_stream': True }) @@ -3887,6 +3886,16 @@ def _create_vars(self, block, ori_block): reserved_x.append(input_name) op.desc.set_input('X', reserved_x) op.desc.set_output('Out', reserved_x) + elif op.type == 'check_finite_and_unscale': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + if len(reserved_x) == 0: + 
block._remove_op(op_idx) + op_size -= 1 + continue elif op.type == 'sum' and self._is_gradient_clip_op(op): for input_name in op.desc.input("X"): if block._find_var_recursive(input_name): @@ -4020,63 +4029,38 @@ def _split_startup_program(self, startup_program, device_id): self._create_vars(new_startup_program.global_block(), block) return new_startup_program - def _find_post_op(self, ops, cur_op, var_name): + def _find_post_op(self, index, var_name): """ - Find the real post op that has variable named var_name as input. - - Args: - ops (list): A list of ops. - cur_op (Operator): Current operator which has variable named - var_name as output. - var_name (string): Variable name. + Find the post op that has variable named var_name as input. """ - # To skip the cast op added by amp which has no op_device set + # bugfix for uniform hybrid parallelism if '.cast_fp32' in var_name: var_name = var_name.replace('.cast_fp32', '') - elif '.cast_fp16' in var_name: + if '.cast_fp16' in var_name: var_name = var_name.replace('.cast_fp16', '') - post_op = [] - before = True - for op in ops: - if op == cur_op: - before = False - continue - if before: - continue - for in_var_name in op.input_arg_names: - if in_var_name == var_name: - post_op.append(op) - break - if post_op: - return post_op[0] - return None - def _find_real_prev_op(self, ops, cur_op, var_name): - """ - Find the real previous op that outputs variable named var_name. + post_ops = self.input_var_to_op[var_name] + if post_ops == None: return None + result_op = None + for post_op, post_idx in reversed(post_ops): + if post_idx > index: + result_op = post_op + break + return result_op - Args: - ops (list): A list of ops. - cur_op (Operator): Current operator which has variable named - var_name as input. - var_name (string): Variable name. + def _find_prev_op(self, index, var_name): """ - prev_op = [] - for op in ops: - if op.type == 'send_v2' or op.type == 'recv_v2' \ - or op.type == 'c_broadcast': - continue - if op == cur_op: + Find the previous op of op with index that outputs + variable named var_name. + """ + prev_ops = self.output_var_to_op[var_name] + if prev_ops == None: return None + result_op = None + for prev_op, prev_idx in reversed(prev_ops): + if prev_idx < index: + result_op = prev_op break - for out_var_name in op.output_arg_names: - if out_var_name == var_name: - prev_op.append(op) - if prev_op: - # A op may have more than one prev op, - # e.g., for 'learning_rate', there may be multiple ops have it as - # output. - return prev_op[-1] - return None + return result_op def _rename_arg(self, op, old_name, new_name): op._rename_input(old_name, new_name) @@ -4136,23 +4120,37 @@ def _add_op_device_attr_for_op(self, op, idx, block): # For LRSched ops, we should put them on all sub-programs to # make sure each sub-program update the lr correctly op._set_attr(self._op_device_key, "gpu:all") + # bugfix in hybrid parallelism + elif op.type == "sum" and self._is_backward_op(op): + # For sum ops that compute the sum of @RENAMED@ vars + for name in op.desc.input_arg_names(): + assert '@RENAME@' in name, \ + "The op must be sum used to accumulate renamed vars." + assert len(op.desc.output_arg_names()) == 1 + out_name = op.desc.output_arg_names()[0] + post_op = self._find_post_op(idx, out_name) + assert post_op.has_attr( + 'op_device'), "{} has no op_device attr for var {}".format( + post_op.type, out_name) + device = post_op.attr(self._op_device_key) + assert device, "The post op must have op_device set." 
+ op._set_attr(self._op_device_key, device) elif (op.type == "cast" or op.type == "scale") and self._is_backward_op(op): - prev_op = self._find_real_prev_op(block.ops, op, - op.desc.input("X")[0]) + prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) elif op.type == "memcpy" and not self._is_optimize_op(op): + # for checkpoint offloading assert len(op.input_arg_names) == 1 and len( op.output_arg_names) == 1 input_name = op.input_arg_names[0] output_name = op.output_arg_names[0] if '@Fetch' in output_name: - post_op = self._find_post_op(block.ops, op, output_name) + post_op = self._find_post_op(idx, output_name) op._set_attr(self._op_device_key, post_op.attr(self._op_device_key)) else: - prev_op = self._find_real_prev_op(block.ops, op, - op.desc.input("X")[0]) + prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) elif self._is_loss_op(op): @@ -4165,16 +4163,11 @@ def _add_op_device_attr_for_op(self, op, idx, block): assert device, "Please put you program within device_guard scope." for i in range(offset): block.ops[idx + i]._set_attr(self._op_device_key, device) - elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale": - op_role_var = op.attr(self._op_role_var_key) - param_name = op_role_var[0] - device = self._param_device_map[param_name] - op._set_attr(self._op_device_key, device) elif self._is_optimize_op(op) and op.type == "cast": # For fp16-->fp32 cast added by AMP grad_name = op.output('Out') assert len(grad_name) == 1 - param_name = grad_name[0].strip(core.grad_var_suffix()) + param_name = self._strip_grad_suffix(grad_name[0]) device = self._param_device_map[param_name] op._set_attr(self._op_device_key, device) elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): @@ -4197,7 +4190,11 @@ def _add_op_device_attr_for_op(self, op, idx, block): op._set_attr(self._op_device_key, device) else: other_known_ops = [ - 'update_loss_scaling', 'reduce_any', 'concat', 'sum' + 'update_loss_scaling', + 'reduce_any', + 'concat', + 'sum', + 'check_finite_and_unscale', ] assert op.type in other_known_ops, "For other ops without " \ "op_device set, they must be one of {}, but it " \ @@ -4274,41 +4271,78 @@ def _insert_sendrecv_ops_for_boundaries(self, block): Insert a pair of send and recv ops for every two consecutive ops on different devices. """ - extra_index = 0 - # A map from var to device where op takes it as input, # avoiding multiple send and recv ops. - var_dev_map = dict() + input_var_to_device = dict() + # bugfix hybrid parallelism + first_optimize_index = None + for index, op in enumerate(list(block.ops)): + if self._is_optimize_op(op): + first_optimize_index = index + break + extra_index_info = { + 'index': 0, + 'first_optimize_index': first_optimize_index + } for index, op in enumerate(list(block.ops)): cur_device = op.attr(self._op_device_key) if cur_device == "gpu:all": continue for var_name in op.input_arg_names: - # i.e., lod_tensor_blocking_queue created by DataLoader, - # which only exists in startup program. 
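The rewritten ``_find_post_op``/``_find_prev_op`` lookups above no longer scan ``block.ops`` linearly; they query the per-variable index that ``_get_input_output_info`` builds further down. A minimal, self-contained sketch of that idea (``Op`` is a hypothetical stand-in for a framework operator):

class Op(object):
    def __init__(self, inputs, outputs):
        self.input_arg_names = inputs
        self.output_arg_names = outputs

ops = [Op([], ['a']), Op(['a'], ['b']), Op(['b'], ['c'])]

# index every op by the variables it reads and writes
input_var_to_op, output_var_to_op = {}, {}
for idx, op in enumerate(ops):
    for name in op.input_arg_names:
        input_var_to_op.setdefault(name, []).append([op, idx])
    for name in op.output_arg_names:
        output_var_to_op.setdefault(name, []).append([op, idx])

def find_prev_op(index, var_name):
    # latest producer of var_name located before position `index`
    for op, op_idx in reversed(output_var_to_op.get(var_name, [])):
        if op_idx < index:
            return op
    return None

assert find_prev_op(2, 'b') is ops[1]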
var = block.var(var_name) - # skip data, because we will process it later + # skip data var if var.is_data: continue prev_device = None - if var_name in self._param_device_map: + generate_ops = self.output_var_to_op.get(var_name) + if generate_ops is None: + if var_name not in self._param_device_map: + continue prev_device = self._param_device_map[var_name] - prev_op = self._find_real_prev_op(block.ops, op, var_name) + + prev_op = self._find_prev_op(index, var_name) + if not prev_device: prev_device = prev_op.attr(self._op_device_key) \ if prev_op else None - if not prev_device or prev_device == 'gpu:all': continue - if prev_device != cur_device: - if var_name not in var_dev_map: var_dev_map[var_name] = [] - if cur_device in var_dev_map[var_name]: continue - var_dev_map[var_name].append(cur_device) + if prev_device is None or prev_device == "gpu:all": continue - op_role = op.all_attrs()[self._op_role_key] + if prev_device == cur_device: continue + + if var_name not in input_var_to_device: + input_var_to_device[var_name] = [] + if (cur_device, prev_device) in input_var_to_device[var_name]: + continue + + device_type = cur_device.split(':')[0] + ':' + + def _insert_send_recv(cur_id, prev_id): + cur_dev = device_type + str(cur_id) + prev_dev = device_type + str(prev_id) + if (cur_dev, prev_dev) in input_var_to_device[var_name]: + return + + if cur_id - prev_id > 1: + _insert_send_recv(cur_id - 1, prev_id) + _insert_send_recv(cur_id, cur_id - 1) + input_var_to_device[var_name].append( + (cur_dev, prev_dev)) + return + elif cur_id - prev_id < -1: + _insert_send_recv(cur_id + 1, prev_id) + _insert_send_recv(cur_id, cur_id + 1) + input_var_to_device[var_name].append( + (cur_dev, prev_dev)) + return + + assert abs(cur_id - prev_id) == 1 + input_var_to_device[var_name].append((cur_dev, prev_dev)) + + op_role = op.attr(self._op_role_key) var = block.vars[var_name] - prev_device_index = int(prev_device.split(':')[1]) - cur_device_index = int(cur_device.split(':')[1]) - pair = (prev_device_index, cur_device_index) - pair_key = prev_device_index * 1000 + cur_device_index + pair = (prev_id, cur_id) + # 1000 is just a magic number + pair_key = prev_id * 1000 + cur_id if pair not in self._pipeline_pair: self._pipeline_pair.append(pair) self._pp_ring_map[pair_key] = self.ring_id @@ -4316,89 +4350,104 @@ def _insert_sendrecv_ops_for_boundaries(self, block): self.ring_id += 1 else: ring_id = self._pp_ring_map[pair_key] + if self.schedule_mode == 'F-then-B': # F-then-B block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='send_v2', inputs={'X': var}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: op_role, 'use_calc_stream': True, 'peer': 1, 'ring_id': ring_id }) - extra_index += 1 + extra_index_info['index'] += 1 block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='recv_v2', outputs={'Out': [var]}, attrs={ 'out_shape': var.shape, 'dtype': var.dtype, - self._op_device_key: cur_device, + self._op_device_key: cur_dev, self._op_role_key: op_role, 'use_calc_stream': True, 'peer': 0, 'ring_id': ring_id }) - extra_index += 1 + extra_index_info['index'] += 1 elif self.schedule_mode == '1F1B': # 1F1B block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='c_sync_calc_stream', inputs={'X': [var]}, outputs={'Out': [var]}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: op_role, }) - extra_index += 1 + 
extra_index_info['index'] += 1 block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='send_v2', inputs={'X': var}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: op_role, 'use_calc_stream': False, 'ring_id': ring_id, 'peer': 1, }) - extra_index += 1 + extra_index_info['index'] += 1 + insert_index = None + if int(op_role) == int(self._op_role.Backward): + insert_index = extra_index_info[ + 'first_optimize_index'] + new_op_role = self._op_role.Optimize + else: + insert_index = index + new_op_role = self._op_role.Backward block._insert_op( - index=index + extra_index, + index=insert_index + extra_index_info['index'], type='c_sync_comm_stream', inputs={'X': [var]}, outputs={'Out': [var]}, attrs={ - self._op_device_key: prev_device, - self._op_role_key: self._op_role.Backward, + self._op_device_key: prev_dev, + self._op_role_key: new_op_role, 'ring_id': ring_id, }) - extra_index += 1 + if int(op_role) == int(self._op_role.Forward): + extra_index_info['index'] += 1 var_shape = list(var.shape) var_shape[0] = self.micro_batch_size if var_shape[ 0] < 0 else var_shape[0] block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='recv_v2', outputs={'Out': [var]}, attrs={ 'out_shape': var_shape, 'dtype': var.dtype, - self._op_device_key: cur_device, + self._op_device_key: cur_dev, self._op_role_key: op_role, 'use_calc_stream': True, 'peer': 0, 'ring_id': ring_id }) - extra_index += 1 + extra_index_info['index'] += 1 else: raise ValueError( "Now only 'F-then-B' and '1F1B' are supported." "The given value is {}.".format(self.schedule_mode)) + _insert_send_recv( + int(cur_device.split(':')[1]), + int(prev_device.split(':')[1])) + block._sync_with_cpp() + def _insert_loss_scale(self, block): """ Scale the loss corresponding to number of micro-batches. @@ -4675,6 +4724,23 @@ def _is_regularization_op(self, op): return op.desc.has_attr("op_namescope") \ and op.desc.attr("op_namescope").startswith("/regularization") + def _get_input_output_info(self, block): + ''' + Get info of op input and output. + ''' + # A map from output var to op which generate it. + self.output_var_to_op = dict() + # A map from var to op which takes it as input. + self.input_var_to_op = dict() + + for index, op in enumerate(list(block.ops)): + for var_name in op.input_arg_names: + ops = self.input_var_to_op.setdefault(var_name, []) + ops.append([op, index]) + for var_name in op.output_arg_names: + ops = self.output_var_to_op.setdefault(var_name, []) + ops.append([op, index]) + def minimize(self, loss, startup_program=None, @@ -4682,30 +4748,35 @@ def minimize(self, no_grad_set=None): main_block = loss.block self.origin_main_block = main_block + main_program = main_block.program if startup_program is None: startup_program = default_startup_program() - optimize_ops, params_grads = self._optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set) - self._param_device_map = self._origin_optimizer._param_device_map - assert main_block.program._pipeline_opt \ - and 'local_rank' in main_block.program._pipeline_opt, \ - 'Please use pipeline with fleet.' 
- local_rank = main_block.program._pipeline_opt['local_rank'] - self._global_ring_id = main_block.program._pipeline_opt[ - 'global_ring_id'] - schedule_mode = 0 - if 'schedule_mode' in main_block.program._pipeline_opt: - schedule_mode = main_block.program._pipeline_opt['schedule_mode'] - self.schedule_mode = schedule_mode - # micro batch size + + assert main_program._pipeline_opt, 'Please use pipeline with fleet.' + required_keys = [ + 'local_rank', + 'schedule_mode', + 'micro_batch_size', + 'ring_id', + 'global_ring_id', + 'use_sharding', + ] + for key in required_keys: + assert key in main_program._pipeline_opt, \ + 'Please use pipeline with fleet to use {}.'.format(key) + self.local_rank = main_block.program._pipeline_opt['local_rank'] + self.schedule_mode = main_block.program._pipeline_opt['schedule_mode'] self.micro_batch_size = main_block.program._pipeline_opt[ 'micro_batch_size'] - - self.use_sharding = False - if 'use_sharding' in main_block.program._pipeline_opt: - self.use_sharding = main_block.program._pipeline_opt['use_sharding'] + self.use_sharding = main_block.program._pipeline_opt['use_sharding'] self.ring_id = main_block.program._pipeline_opt['ring_id'] + self.global_ring_id = main_block.program._pipeline_opt['global_ring_id'] + + optimize_ops, params_grads = self._optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + self._param_device_map = self._origin_optimizer._param_device_map + self._get_input_output_info(main_block) # Step1: add default op_device attribute for ops. self._add_op_device_attr(main_block) device_list = self._check_validation(main_block) @@ -4736,26 +4807,27 @@ def device_cmp(device1, device2): # Step4: Special Case: process persistable vars that exist in # multiple sections - self._process_persistable_vars_in_multi_sections( - main_program, startup_program, program_list) + # FIXME + # self._process_persistable_vars_in_multi_sections( + # main_program, startup_program, program_list) # Step5: Add sub blocks for section programs self._add_sub_blocks(main_block, program_list) - local_rank = main_program._pipeline_opt['local_rank'] % len(device_list) + self.local_rank %= len(device_list) place_list = [] for dev in device_list: dev_index = int(dev.split(":")[1]) - place_list.append(core.CUDAPlace(dev_index % 8)) + place_list.append(core.CUDAPlace(0)) # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, - local_rank) + self.local_rank) startup_program._pipeline_opt = { "startup_program": new_startup_program, } - real_block = program_list[local_rank].global_block() + real_block = program_list[self.local_rank].global_block() self._insert_loss_scale(real_block) if not self.use_sharding: # Step7: clear gradients before each mini-batch and @@ -4769,12 +4841,12 @@ def device_cmp(device1, device2): main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", - "pipeline_stage": local_rank, + "pipeline_stage": self.local_rank, "num_pipeline_stages": len(device_list), "schedule_mode": self.schedule_mode, "inner_parallelism": len(device_list), - "section_program": program_list[local_rank], - "place": place_list[local_rank], + "section_program": program_list[self.local_rank], + "place": place_list[self.local_rank], "place_id": place_id, "sync_steps": -1, "num_microbatches": self._num_microbatches, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0c292d355ddc0b..e1c5ae750d9b36 100644 --- 
a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -19,6 +19,8 @@ list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -38,6 +40,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleetrun) list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) +list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) @@ -82,6 +86,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api) LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api) LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api) + LIST(REMOVE_ITEM TEST_OPS test_new_group_api) LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api) LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api) LIST(REMOVE_ITEM TEST_OPS test_collective_wait) @@ -159,6 +164,8 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) elseif(WITH_GPU) @@ -177,6 +184,7 @@ endif() if ((NOT WITH_NCCL) AND (NOT WITH_RCCL)) list(REMOVE_ITEM TEST_OPS test_imperative_group) + LIST(REMOVE_ITEM TEST_OPS test_new_group_api) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) @@ -525,6 +533,10 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + if(WITH_ASCEND) + bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + endif() # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) @@ -535,7 +547,9 @@ if(WITH_DISTRIBUTE) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) - bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + # solve it later. 
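+ # test_fleet_launch_ps is temporarily disabled below; test_new_group takes the exclusive serial slot instead.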
+ # bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() @@ -719,7 +733,7 @@ if (WIN32) set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250) else() set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600) - set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 150) + set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250) endif() set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 120) set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) @@ -821,16 +835,19 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) + set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) endif() endif() if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE) set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) endif() @@ -853,6 +870,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) test_collective_barrier_api test_collective_reduce_api test_collective_allreduce_api + test_new_group_api test_collective_broadcast_api test_collective_allgather_api PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py new file mode 100644 index 00000000000000..78a3687b5ca3cd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -0,0 +1,140 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
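+# This test builds three two-rank communication groups over four trainer endpoints +# and runs a c_allreduce_sum through AscendOptimizer; launched by test_ascend_group.sh.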
+ +import os +import sys +import time +import paddle.fluid as fluid +from paddle.fluid import unique_name +import paddle.fluid.core as core +import paddle +from paddle.fluid.layer_helper import LayerHelper +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_optimizers.ascend import ascend_parser, ascend_optimizer +from collections import namedtuple + +Block = namedtuple('Block', ['program']) +Loss = namedtuple('Loss', ['block']) + +paddle.enable_static() + +OpRole = core.op_proto_and_checker_maker.OpRole +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() +OP_ROLE_VAR_KEY = core.op_proto_and_checker_maker.kOpRoleVarAttrName() + +role = fleet.PaddleCloudRoleMaker(is_collective=True) +fleet.init(role) + + +def init_communicator(startup_program, main_program, current_endpoint, + endpoints, ring_id): + nranks = len(endpoints) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + group_rank = endpoints.index(current_endpoint) + assert group_rank >= 0 + + block = startup_program.global_block() + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': group_rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + OP_ROLE_KEY: OpRole.Forward, + }) + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': group_rank, + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward, + }) + + with fluid.program_guard(main_program): + op_type = "c_allreduce_sum" + data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) + helper = LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [data]}, + outputs={'Out': [data]}, + attrs={'ring_id': ring_id, + 'use_calc_stream': True}) + + print("startup program:", startup_program) + print("main program:", main_program) + + +def train(world_endpoints, world_device_ids, local_device_ids, local_rank): + startup_programs = [] + main_programs = [] + + #trainer_endpoints=["127.0.0.1:6071","127.0.0.1:6072","127.0.0.1:6073","127.0.0.1:6074"] + trainer_endpoints = world_endpoints + groups = [[], [], []] + groups[0] = [trainer_endpoints[0], trainer_endpoints[1]] + groups[1] = [trainer_endpoints[2], trainer_endpoints[3]] + groups[2] = [trainer_endpoints[0], trainer_endpoints[2]] + print("groups:", groups) + + for i in range(len(trainer_endpoints)): + startup_programs.append(fluid.Program()) + main_programs.append(fluid.Program()) + + for idx, group in enumerate(groups): + for te in group: + te_idx = trainer_endpoints.index(te) + startup_program = startup_programs[te_idx] + main_program = main_programs[te_idx] + init_communicator(startup_program, main_program, te, group, idx) + + print(len(startup_programs)) + print(startup_programs[local_rank]) + print(main_programs[local_rank]) + + print("local rank: ", local_rank) + print("local startup program: ", startup_programs[local_rank]) + + startup_program = startup_programs[local_rank] + main_program = main_programs[local_rank] + loss = Loss(Block(main_program)) + optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[]) + optimizer.minimize(loss, startup_program, auto_dp=True) + + exe = paddle.static.Executor(paddle.CPUPlace()) + #exe.run(startup_program) + exe.run(main_program) + + +worker_endpoints = fleet.worker_endpoints() +world_device_ids = 
fleet.world_device_ids() +local_device_ids = fleet.local_device_ids() +local_rank = int(fleet.local_rank()) + +print("worker_endpoints:", worker_endpoints) +print("world_device_ids:", world_device_ids) +print("local_device_ids:", local_device_ids) +print("local_rank:", local_rank) + +train(worker_endpoints, world_device_ids, local_device_ids, local_rank) diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py new file mode 100644 index 00000000000000..33e6f63ea10ced --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -0,0 +1,41 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time + + +def train(prefix): + selected_accelerators = os.getenv("FLAGS_selected_accelerators") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") + current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS") + + details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ + .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) + + print(details) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(details) + + +if __name__ == '__main__': + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py new file mode 100644 index 00000000000000..597765cfb9811c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
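+# Runner that creates a two-rank group with paddle.distributed.new_group and issues +# an all_reduce on it; driven by runtime_main from test_collective_api_base.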
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllreduceNewGroupAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + gp = paddle.distributed.new_group([0, 1]) + paddle.distributed.all_reduce( + tindata, group=gp, use_calc_stream=False) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduceNewGroupAPI, "allreduce") diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py index 22c930bf8948aa..676b15c0d93e76 100755 --- a/python/paddle/fluid/tests/unittests/dist_sharding_save.py +++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py @@ -59,7 +59,11 @@ def runtime_main(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.sharding = True - strategy.sharding_configs = {"fuse_broadcast_MB": 0.2} + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "sharding_degree": 2, + } optimizer = paddle.fluid.optimizer.Momentum( learning_rate=0.01, momentum=0.9) diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 1c74a11cc4d2e6..549975f5d3f0f4 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -146,7 +146,11 @@ def set_strategy(self, strategy, name): strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} elif name == "sharding": strategy.sharding = True - strategy.sharding_configs = {"fuse_broadcast_MB": 0.2} + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "sharding_degree": 2, + } elif name == "recompute-offload": strategy.recompute = True strategy.recompute_configs = { diff --git a/python/paddle/fluid/tests/unittests/hybrid_communicate_group.py b/python/paddle/fluid/tests/unittests/hybrid_communicate_group.py new file mode 100644 index 00000000000000..0a9785475b561a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_communicate_group.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +import paddle +from paddle.distributed import fleet + + +class TestNewGroupAPI(object): + def __init__(self): + paddle.distributed.init_parallel_env() + topo = fleet.CommunicateTopology(["data", "model", "pipe"], [2, 1, 1]) + self.hcg = fleet.HybridCommunicateGroup(topo) + + d1 = np.array([1, 2, 3]) + d2 = np.array([2, 3, 4]) + self.tensor1 = paddle.to_tensor(d1) + self.tensor2 = paddle.to_tensor(d2) + + def test_all(self): + topo = self.hcg.topology() + global_rank = self.hcg.get_data_parallel_rank() + + dp_rank = self.hcg.get_data_parallel_rank() + dp_gp = self.hcg.get_data_parallel_group() + dp_world_size = self.hcg.get_data_parallel_world_size() + dp_src_rank = self.hcg.get_data_parallel_group_src_rank() + np.testing.assert_array_equal(dp_world_size, 2) + np.testing.assert_array_equal(dp_src_rank, 0) + + mp_rank = self.hcg.get_model_parallel_rank() + mp_gp = self.hcg.get_model_parallel_group() + mp_world_size = self.hcg.get_model_parallel_world_size() + mp_src_rank = self.hcg.get_model_parallel_group_src_rank() + np.testing.assert_array_equal(mp_world_size, 1) + + tmp = np.array([0, 0, 0]) + result = paddle.to_tensor(tmp) + paddle.distributed.scatter( + result, [self.tensor2, self.tensor1], + src=dp_src_rank, + group=dp_gp, + use_calc_stream=True) + if dp_rank == 0: + assert np.array_equal(result, self.tensor2) + elif dp_rank == 1: + assert np.array_equal(result, self.tensor1) + print("test scatter api ok") + + paddle.distributed.broadcast( + result, src=1, group=dp_gp, use_calc_stream=True) + assert np.array_equal(result, self.tensor1) + print("test broadcast api ok") + + paddle.distributed.reduce( + result, dst=dp_src_rank, group=dp_gp, use_calc_stream=True) + if dp_rank == 0: + assert np.array_equal(result, + paddle.add(self.tensor1, self.tensor1)) + elif dp_rank == 1: + assert np.array_equal(result, self.tensor1) + print("test reduce api ok") + + paddle.distributed.all_reduce(result, use_calc_stream=True) + assert np.array_equal( + result, + paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1)) + print("test all_reduce api ok") + + paddle.distributed.wait(result, dp_gp, use_calc_stream=True) + paddle.distributed.wait(result, dp_gp, use_calc_stream=False) + print("test wait api ok") + + result = [] + paddle.distributed.all_gather( + result, self.tensor1, group=dp_gp, use_calc_stream=True) + assert np.array_equal(result[0], self.tensor1) + assert np.array_equal(result[1], self.tensor1) + print("test all_gather api ok") + + paddle.distributed.barrier(group=dp_gp) + print("test barrier api ok") + + return + + +if __name__ == "__main__": + gpt = TestNewGroupAPI() + gpt.test_all() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py index 95cff4de6f6b08..69a9ae3c0ad2c9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py @@ -40,9 +40,11 @@ def setUp(self): matmul_ab_square = paddle.square(matmul_ab) matmul_square_ab = paddle.matmul(data_a_square, data_b_square) - scale = paddle.fluid.layers.fill_constant(shape=[1], value=0.5, dtype='float32') + scale = paddle.fluid.layers.fill_constant( + shape=[1], value=0.5, dtype='float32') - sub_val = 
paddle.fluid.layers.elementwise_sub(matmul_ab_square, matmul_square_ab) + sub_val = paddle.fluid.layers.elementwise_sub(matmul_ab_square, + matmul_square_ab) squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py new file mode 100644 index 00000000000000..1d6f1c2c45910d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTAnchorGeneratorBaseTest(InferencePassTest): + def setUp(self): + self.bs = 1 + self.channel = 16 + self.height = 32 + self.width = 32 + self.anchor_sizes = [64., 128., 256., 512.] + self.aspect_ratios = [.5, 1., 2.] + self.variance = [.1, .1, .2, .2] + self.stride = [8., 8.] 
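+ # default engine settings; the test_* cases below override precision, + # serialization and dynamic shape before calling run_test()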
+ self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + self.feeds = { + 'data': + np.random.random([self.bs, self.channel, self.height, + self.width]).astype('float32'), + } + + def build(self): + min_graph_size = 3 if self.dynamic_shape_params is not None else 2 + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, min_graph_size, self.precision, self.serialize, + False) + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', + shape=[-1, self.channel, self.height, self.width], + dtype='float32') + anchor, var = fluid.layers.detection.anchor_generator( + data, + anchor_sizes=self.anchor_sizes, + aspect_ratios=self.aspect_ratios, + variance=self.variance, + stride=self.stride) + if self.dynamic_shape_params is not None: + anchor = fluid.layers.transpose(anchor, [2, 3, 0, 1]) + out = fluid.layers.batch_norm(anchor, is_test=True) + + self.fetch_list = [out, var] + + def run_test(self): + self.build() + self.check_output() + + def set_dynamic(self): + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'data': [self.bs, self.channel, self.height // 2, self.width // 2] + }, { + 'data': [self.bs, self.channel, self.height, self.width] + }, {'data': [self.bs, self.channel, self.height, self.width]}, False) + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.set_dynamic() + self.run_test() + + def test_dynamic_serialize(self): + self.serialize = True + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16_serialize(self): + self.serialize = True + self.precision = AnalysisConfig.Precision.Half + self.set_dynamic() + self.run_test() + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 94434f40434489..080d1ccc9054bc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -25,19 +25,16 @@ class TensorRTMatMulDims2Test(InferencePassTest): def setUp(self): self.set_params() with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[24, 24], dtype="float32") + data = fluid.data(name="data", shape=[24, 24], dtype="float32") matmul_out = fluid.layers.matmul( x=data, y=data, - transpose_x = self.transpose_x, - transpose_y = self.transpose_y, - alpha = self.alpha) + transpose_x=self.transpose_x, + transpose_y=self.transpose_y, + alpha=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) - self.feeds = { - "data": np.ones([24, 24]).astype("float32"), - } + self.feeds = {"data": np.ones([24, 24]).astype("float32"), } self.enable_trt = True self.trt_parameters = TensorRTMatMulDims2Test.TensorRTParam( 1 << 30, 32, 0, 
AnalysisConfig.Precision.Float32, False, False) @@ -65,14 +62,12 @@ def setUp(self): matmul_out = fluid.layers.matmul( x=data, y=data, - transpose_x = self.transpose_x, - transpose_y = self.transpose_y, - alpha = self.alpha) + transpose_x=self.transpose_x, + transpose_y=self.transpose_y, + alpha=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) - self.feeds = { - "data": np.ones([1, 6, 24, 24]).astype("float32"), - } + self.feeds = {"data": np.ones([1, 6, 24, 24]).astype("float32"), } self.enable_trt = True self.trt_parameters = TensorRTMatMulTest.TensorRTParam( 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) diff --git a/python/paddle/fluid/tests/unittests/new_group.py b/python/paddle/fluid/tests/unittests/new_group.py new file mode 100644 index 00000000000000..fb7beeee1df2e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/new_group.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +import paddle + + +class TestNewGroupAPI(object): + def __init__(self): + paddle.distributed.init_parallel_env() + d1 = np.array([1, 2, 3]) + d2 = np.array([2, 3, 4]) + self.tensor1 = paddle.to_tensor(d1) + self.tensor2 = paddle.to_tensor(d2) + + def test_all(self): + gp = paddle.distributed.new_group([0, 1]) + print("test new group api ok") + + tmp = np.array([0, 0, 0]) + result = paddle.to_tensor(tmp) + paddle.distributed.scatter( + result, [self.tensor2, self.tensor1], + src=0, + group=gp, + use_calc_stream=True) + if gp.rank == 0: + assert np.array_equal(result, self.tensor2) + elif gp.rank == 1: + assert np.array_equal(result, self.tensor1) + print("test scatter api ok") + + paddle.distributed.broadcast( + result, src=1, group=gp, use_calc_stream=True) + assert np.array_equal(result, self.tensor1) + print("test broadcast api ok") + + paddle.distributed.reduce(result, dst=0, group=gp, use_calc_stream=True) + if gp.rank == 0: + assert np.array_equal(result, + paddle.add(self.tensor1, self.tensor1)) + elif gp.rank == 1: + assert np.array_equal(result, self.tensor1) + print("test reduce api ok") + + paddle.distributed.all_reduce(result, use_calc_stream=True) + assert np.array_equal( + result, + paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1)) + print("test all_reduce api ok") + + paddle.distributed.wait(result, gp, use_calc_stream=True) + paddle.distributed.wait(result, gp, use_calc_stream=False) + print("test wait api ok") + + result = [] + paddle.distributed.all_gather( + result, self.tensor1, group=gp, use_calc_stream=True) + assert np.array_equal(result[0], self.tensor1) + assert np.array_equal(result[1], self.tensor1) + print("test all_gather api ok") + + paddle.distributed.barrier(group=gp) + print("test barrier api ok") + + return + + +if __name__ == "__main__": + gpt = TestNewGroupAPI() + gpt.test_all() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py 
b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py new file mode 100644 index 00000000000000..26c9944abd6c6c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import paddle.distributed as dist + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Embedding +import paddle.nn.functional as F +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +paddle.seed(123) +np.random.seed(2021) + + +class SimpleNet(fluid.Layer): + def __init__(self, hidden_size, vocab_size, is_sparse=False): + super(SimpleNet, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.embedding = Embedding( + size=[self.vocab_size, self.hidden_size], + dtype='float32', + is_sparse=is_sparse) + + self.lin_a = paddle.nn.Linear(self.hidden_size, self.vocab_size) + self.lin_b = paddle.nn.Linear(self.vocab_size, 1) + + self.unused_net = paddle.nn.Linear(5, 3) + self.phony = self.create_parameter(shape=[1], dtype="float32") + + def forward(self, input, label, conf): + x_emb = self.embedding(input) + fc = self.lin_a(x_emb) + mask = conf > 0 + mask = paddle.cast(mask, dtype="int64") + mask.stop_gradient = True + emb_mask = mask.max(1).flatten() + emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() + emb_mask_inds.stop_gradient = True + + if emb_mask_inds.numel() == 0: + loss_box = self.phony * 0 + else: + projection = self.lin_b(fc) + projection = paddle.reshape(projection, shape=[-1, 1]) + output = paddle.gather(projection, emb_mask_inds) + target = paddle.gather(label, emb_mask_inds) + loss_box = F.smooth_l1_loss( + output, target, reduction='sum', delta=1.0) + loss_box = loss_box / len(conf) + + return loss_box + + +# global configs +batch_size = 4 +batch_num = 2000 +hidden_size = 5 +vocab_size = 100 + +conf_dataset = [[0], [0], [0], [0], [1], [0], [1], [0], [0], [1], [0], [1], + [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0], [1]] + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.randint(0, vocab_size) + y_data = np.random.random_sample((1, )).astype('float32') + conf_data = np.array(conf_dataset[i % len(conf_dataset)]).astype( + 'int64') + yield x_data, y_data, conf_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet( + hidden_size=hidden_size, vocab_size=vocab_size, is_sparse=False) + + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x[0] for x in batch]).astype('int64') + y_data = np.array([x[1] 
for x in batch]).astype('float32') + conf_data = np.array([x[2] for x in batch]).astype('int64') + x_data = x_data.reshape((-1, 1)) + y_data = y_data.reshape((-1, 1)) + conf_data = conf_data.reshape((-1, 1)) + + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + conf = paddle.to_tensor(conf_data) + + loss = model(x, y, conf) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py new file mode 100644 index 00000000000000..3157d5e4129eeb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.dygraph.base import to_variable + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +np.random.seed(2021) +paddle.seed(1024) + +batch_size = 4 +batch_num = 1000 + + +class SimpleNet(fluid.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.net_a = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.net_b = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.net_unused = Linear(10, 20) + self.step = 0 + + def forward(self, x): + use_net_a = self.step % 2 == 0 + self.step = self.step + 1 + if use_net_a: + return self.net_a(x) + else: + return self.net_b(x) + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = to_variable(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py new file mode 100644 index 00000000000000..0d2631fa108d28 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors.
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear + +paddle.seed(1024) +np.random.seed(2021) + +batch = 5 +in_dim = 10 +out_dim = 20 + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 10) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float64") + + # just for testing sync_params_buffers + self.register_buffer("queue", paddle.randn([10, 5])) + self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraining(unittest.TestCase): + def test_multiple_gpus(self): + dist.init_parallel_env() + self.trainer_id = dist.get_rank() + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel(model_a) + model_b = paddle.DataParallel(model_b) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + + for step_id in range(5): + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + + out_a.sum().backward() + out_b.sum().backward() + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + # check the accumulated gradients + w1_grad_sum = self.check_acc(model_a._layers.w1.grad, w1_grad_sum, + model_b._layers.w1.grad) + w2_grad_sum = self.check_acc(model_a._layers.w2.grad, w2_grad_sum, + model_b._layers.w2.grad) + + model_a.clear_gradients() + + def check_acc(self, grad, grad_sum, acc_grad): + if grad is not None: + grad_sum = grad_sum + grad + np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6) + return grad_sum + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def broadcast_param(self, param, root): + paddle.distributed.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param._grad_ivar() is not
None): + grad = param._grad_ivar() + other_grad = self.broadcast_param(grad.clone(), root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py new file mode 100644 index 00000000000000..fc0246a9720bfd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Linear + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +np.random.seed(2021) +paddle.seed(1024) + +batch_size = 4 +batch_num = 1000 + + +class SimpleNet(fluid.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.net_a = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.net_b = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.step = 0 + + def forward(self, x): + return paddle.to_tensor(0.0, dtype='float32') + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = paddle.to_tensor(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py new file mode 100644 index 00000000000000..facac33e4c60ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear +from paddle.fluid.dygraph.base import to_variable +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +np.random.seed(2021) +paddle.seed(1024) + + +class SimpleNet(fluid.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.net_a = Linear(input_dim=10, output_dim=5) + self.net_b = Linear(10, 10) + # self.bias is an unused parameter that is shared with net_a + self.bias = self.net_a.bias + + def forward(self, x): + return self.net_b(x) + + +batch_size = 4 +batch_num = 1000 + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = to_variable(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py index 65c242a7023093..a15b263a295086 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py @@ -65,8 +65,6 @@ def __init__(self, def forward(self, input, label): x_emb = self.embedding(input) fc = paddle.matmul(x_emb, self.softmax_weight) - # use detach to stop gradient - fc = fc.detach() fc = paddle.add(fc, self.softmax_bias) projection = paddle.reshape(fc, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py index 1884eef15e9a40..9f877381101e96 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py @@ -37,7 +37,7 @@ def __init__(self, self.embedding = Embedding( self.vocab_size, self.hidden_size, - sparse=True, + sparse=is_sparse, weight_attr=paddle.ParamAttr( name='embedding_param', initializer=paddle.nn.initializer.Uniform( @@ -105,7 +105,7 @@ def get_model(self): vocab_size=vocab_size, num_steps=num_steps, init_scale=init_scale, - is_sparse=True) + is_sparse=False) train_reader = paddle.batch( fake_sample_reader(), batch_size=batch_size, drop_last=True) diff --git
a/python/paddle/fluid/tests/unittests/parallel_test.sh b/python/paddle/fluid/tests/unittests/parallel_test.sh index 9da4f035345d7f..551b7cdb7a43c1 100644 --- a/python/paddle/fluid/tests/unittests/parallel_test.sh +++ b/python/paddle/fluid/tests/unittests/parallel_test.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + unset https_proxy http_proxy export FLAGS_rpc_disable_reuse_port=1 diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py index e4336ab05d58f7..8277256009e72b 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -83,6 +83,8 @@ def initTestCase(self): self.output_shape = np.array([20, 2, 5, 7]).astype("int32") self.dynamic_shape = True self.use_cudnn = True + if paddle.fluid.core.is_compiled_with_rocm(): + self.use_cudnn = False # the ROCm platform does not have a MIOpen kernel for affine_grid self.align_corners = True diff --git a/python/paddle/fluid/tests/unittests/test_ascend_group.sh b/python/paddle/fluid/tests/unittests/test_ascend_group.sh new file mode 100644 index 00000000000000..31c442e0962624 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ascend_group.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
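+# Launches ascend_group.py on four local ranks (--ascend_npus=0,1,2,3) via paddle.distributed.fleet.launch.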
+ +set -e + +cluster_node_ips="127.0.0.1" +export PADDLE_TRAINERS_NUM=4 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=4 + +distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog" +python -m paddle.distributed.fleet.launch ${distributed_args} \ + ascend_group.py fleetascendgroup diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py index 4b39436842b897..ea1a22780f0931 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py @@ -27,8 +27,10 @@ def test_static_layer(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=input_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + input = paddle.fluid.data( + name='input', shape=input_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') if weight_np is not None: weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') @@ -58,8 +60,10 @@ def test_static_functional(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=input_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + input = paddle.fluid.data( + name='input', shape=input_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') if weight_np is not None: weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py index a6175aa471d693..153b8fd3e7f6b0 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py @@ -48,8 +48,10 @@ def test_static(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - logit = paddle.fluid.data(name='logit', shape=logit_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + logit = paddle.fluid.data( + name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') feed_dict = {"logit": logit_np, "label": label_np} pos_weight = None diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh index a9d450e223f1e0..aba95a68ab7908 100644 --- a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh +++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -e # use default values # FIXME: random fails on Unknown command lines -c (or -m). diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 83bba0b0ca1c3a..bbb0f5b10393d0 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -1470,35 +1470,59 @@ def run_7(): not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), "core is not compiled with CUDA or ROCM") class TestConv2DEnviron(unittest.TestCase): - def run_conv2d_api(self): - inputs = fluid.layers.data( - shape=[2, 3, 5, 5], - append_batch_size=False, - name="inputs", - dtype="float32") - fluid.layers.conv2d( - input=inputs, - num_filters=4, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - data_format="NCHW") - - x_var = paddle.uniform((2, 3, 5, 5), dtype="float32", min=-1., max=1.) - conv = paddle.nn.Conv2D( - in_channels=3, - out_channels=4, - kernel_size=(3, 3), - data_format="NCHW") - y_var = conv(x_var) + def run1(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + inputs = fluid.layers.data( + shape=[2, 3, 5, 5], + append_batch_size=False, + name="inputs", + dtype="float32") + result = fluid.layers.conv2d( + input=inputs, + num_filters=4, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + data_format="NCHW") + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + fetches = exe.run(fluid.default_main_program(), + feed={"inputs": self.input_np}, + fetch_list=[result]) + + def run2(self, place): + with fluid.dygraph.guard(place): + inputs = fluid.dygraph.to_variable(self.input_np) + conv = paddle.nn.Conv2D( + in_channels=3, + out_channels=4, + kernel_size=(3, 3), + data_format="NCHW") + result = conv(inputs) + + def run3(self, place): + with fluid.dygraph.guard(place): + inputs = fluid.dygraph.to_variable(self.input_np) + conv = paddle.fluid.dygraph.nn.Conv2D( + num_channels=3, + num_filters=4, + filter_size=(3, 3), ) + result = conv(inputs) + + def run_all(self, place): + self.run1(place) + self.run2(place) + self.run3(place) def test_environ(self): - fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) - self.run_conv2d_api() - fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) - self.run_conv2d_api() + self.input_np = np.random.random([2, 3, 5, 5]).astype("float32") + for place in [paddle.CPUPlace(), paddle.CUDAPlace(0)]: + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) + self.run_all(place) + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) + self.run_all(place) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py new file mode 100644 index 00000000000000..a7472e7ffd7609 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.dygraph as dg +from op_test import OpTest + + +class TestTensorBackward(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_tensor_backward(self): + for dtype in self._dtypes: + x = np.random.random([2, 100]).astype(dtype) + y = np.random.random([100, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + z_tensor.backward(grad_tensor) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +class TestBackwardAPI(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_backward_api(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + z_tensor2 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z_tensor1, z_tensor2], + [grad_tensor, grad_tensor], True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad)) + + def test_backward_single_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward(z_tensor1, grad_tensor, True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + def test_backward_none_grad_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.ones(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + paddle.autograd.backward(z_tensor1, None) + + x_grad = np.matmul(grad, y.T) + + 
self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py old mode 100644 new mode 100755 index d73698e7e024a8..37494294418f1c --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -501,7 +501,12 @@ def run_trainer(self, args): type(self).__name__, "begin to prepare context in dygraph with nccl2") dygraph.parallel.prepare_context(strategy) - model = dygraph.parallel.DataParallel(model, strategy) + if not args.find_unused_parameters: + model = dygraph.parallel.DataParallel( + model, strategy, find_unused_parameters=False) + else: + model = dygraph.parallel.DataParallel( + model, strategy, find_unused_parameters=True) print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") @@ -574,9 +579,14 @@ def run_use_fleet_api_trainer(self, args): # get trainer id args.trainer_id = paddle.distributed.get_rank() + # set strategy + strategy = fleet.DistributedStrategy() + if not args.find_unused_parameters: + strategy.find_unused_parameters = False + # 3. init parallel env if args.update_method == "nccl2" or "bkcl": - fleet.init(is_collective=True) + fleet.init(is_collective=True, strategy=strategy) # 4. train model model, train_reader, opt = self.get_model() @@ -628,6 +638,7 @@ def runtime_main(test_class): parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true') + parser.add_argument('--find_unused_parameters', action='store_true') parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--dc_asgd', action='store_true') parser.add_argument('--hogwild', action='store_true') @@ -726,6 +737,7 @@ def setUp(self): self._save_model = False self._fuse_all_reduce = None self._accumulate_gradient = False + self._find_unused_parameters = True self._setup_config() global DIST_UT_PORT @@ -852,6 +864,9 @@ def _run_local(self, if self._accumulate_gradient: cmd += " --accumulate_gradient" + if self._find_unused_parameters: + cmd += " --find_unused_parameters" + env_local.update(envs) print("local_cmd: {}, env: {}".format(cmd, env_local)) @@ -1021,6 +1036,9 @@ def _get_nccl2_trainer_cmd(self, model, ep, update_method, trainer_id, if self._accumulate_gradient: tr_cmd += " --accumulate_gradient" + if self._find_unused_parameters: + tr_cmd += " --find_unused_parameters" + if self._pipeline_mode: tr_cmd += " --use_pipeline" if self._mp_mode: @@ -1107,6 +1125,7 @@ def _run_cluster_nccl2(self, model, envs, update_method, check_error_log, if check_error_log: print("outs[0]:", outs[0]) print("outs[1]:", outs[1]) + return pickle.loads(outs[0]), pickle.loads(outs[1]) def _run_pipeline(self, model, envs, check_error_log, log_name): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py index 16584ee50081ae..a82866a797db15 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py @@ -23,7 +23,6 @@ paddle.enable_static() - # For Net base_lr = 0.2 emb_lr = base_lr * 3 diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 
c305f71aa53657..0dd78ea53c27b2 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -375,15 +375,9 @@ def test_errors(self): out=x1) # The argument dtype of fill_constant_op must be one of bool, float16, - #float32, float64, int32 or int64 + #float32, float64, uint8, int32 or int64 x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") - self.assertRaises( - TypeError, - fluid.layers.fill_constant, - shape=[1], - value=5, - dtype='uint8') self.assertRaises( TypeError, fluid.layers.fill_constant, diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index aa85eb3df35270..28803f5ac62320 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -170,7 +170,8 @@ def test_type(): x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. x2 = x2.astype('float16') - x2_var = paddle.fluid.data(name='x2', shape=[3, 2, 4, 5], dtype='float16') + x2_var = paddle.fluid.data( + name='x2', shape=[3, 2, 4, 5], dtype='float16') paddle.flatten(x2_var) self.assertRaises(TypeError, test_type) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 31771ddbd68744..d843e172763fe5 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -179,6 +179,15 @@ def test_last_comm_group_size_MB(self): with self.assertRaises(ValueError): strategy.last_comm_group_size_MB = -1 + def test_find_unused_parameters(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.find_unused_parameters = True + self.assertEqual(strategy.find_unused_parameters, True) + strategy.find_unused_parameters = False + self.assertEqual(strategy.find_unused_parameters, False) + strategy.find_unused_parameters = "True" + self.assertEqual(strategy.find_unused_parameters, False) + def test_fuse_grad_size_in_TFLOPS(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy._fuse_grad_size_in_TFLOPS = 0.1 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh new file mode 100644 index 00000000000000..0960083abf28ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
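The `find_unused_parameters` strategy test above pins down the setter semantics: boolean assignments are stored as given, while a non-boolean assignment is rejected (the setter only accepts bools) and the previous value is kept. A minimal standalone sketch of that contract, assuming only the public `DistributedStrategy` API exercised by the test:

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.find_unused_parameters = True    # plain bools are stored
    strategy.find_unused_parameters = False
    strategy.find_unused_parameters = "True"  # non-bool: rejected, flag stays False
    assert not strategy.find_unused_parameters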
+ +set -e + +# use paddlecloud +echo "begin test use paddlecloud" +cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=2 + +distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" +python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend + +str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1" +file_0="multi_process_fleetlaunchascend.check_0.log" +file_1="multi_process_fleetlaunchascend.check_1.log" + +echo "paddlecloud params test" +if grep -q "$str1" "$file_0"; then + echo "find trainer 0" +else + echo "not find trainer 0" + exit -1 +fi + +if grep -q "$str2" "$file_1"; then + echo "find trainer 1" +else + echo "not find trainer 1" + exit -1 +fi + +# test async poll process +if [ -f $file_0 ]; then + rm $file_0 +fi +if [ -f $file_1 ]; then + rm $file_1 +fi diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 5da7e627f8707d..f28bf89ff5c30b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -45,6 +45,7 @@ def test_sharding_optimizer(self): "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" ])) + self.assertEqual(ops, [ 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', @@ -55,9 +56,9 @@ def test_sharding_optimizer(self): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum', + 'momentum', 'momentum' ]) def test_sharding_amp_optimizer(self): @@ -82,6 +83,7 @@ def test_sharding_amp_optimizer(self): "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0", "loss_scaling_0", "num_bad_steps_0", "num_good_steps_0" ])) + self.assertEqual(ops, [ 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', @@ -94,11 +96,10 @@ def test_sharding_amp_optimizer(self): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', - 'c_sync_calc_stream', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_sync_comm_stream', 'cast', 'cast', 'cast', - 'check_finite_and_unscale', 'cast', 'c_sync_calc_stream', - 'c_allreduce_max', 'c_sync_comm_stream', 'cast', + 
'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', 'momentum', 'momentum' ]) @@ -124,6 +125,7 @@ def test_sharding_recompute_optimizer(self): "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" ])) + self.assertEqual(ops, [ 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', @@ -134,10 +136,9 @@ def test_sharding_recompute_optimizer(self): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', - 'mul_grad', 'c_sync_calc_stream', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_sync_comm_stream', - 'momentum', 'momentum', 'momentum' + 'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' ]) def test_sharding_amp_recompute_optimizer(self): @@ -167,29 +168,27 @@ def test_sharding_amp_recompute_optimizer(self): "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0", "loss_scaling_0", "num_bad_steps_0", "num_good_steps_0" ])) - self.assertEqual(ops, [ - 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', + 'cast', 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', - 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', - 'cast', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', - 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'fill_constant', 'scale', - 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'cast', - 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', - 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh_grad', - 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', - 'cast', 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', + 'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', + 'fill_constant', 'scale', 'elementwise_mul_grad', 'mean_grad', + 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'cast', 'cast', 'cast', - 'check_finite_and_unscale', 'cast', 'c_sync_calc_stream', - 'c_allreduce_max', 'c_sync_comm_stream', 'cast', - 'update_loss_scaling', 'momentum', 'momentum', 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 
'c_reduce_sum', 'c_sync_comm_stream', 'cast', + 'cast', 'cast', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum' ]) def test_sharding_weight_decay(self): @@ -227,10 +226,10 @@ def test_sharding_weight_decay(self): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'scale', 'sum', 'scale', 'sum', 'scale', - 'sum', 'momentum', 'momentum', 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'scale', + 'sum', 'scale', 'sum', 'scale', 'sum', 'momentum', 'momentum', + 'momentum' ]) def test_sharding_gradient_clip(self): @@ -253,6 +252,7 @@ def test_sharding_gradient_clip(self): "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" ])) + self.assertEqual(ops, [ 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', @@ -263,14 +263,12 @@ def test_sharding_gradient_clip(self): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'square', 'reduce_sum', 'square', - 'reduce_sum', 'square', 'reduce_sum', 'sum', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_sync_comm_stream', 'sqrt', 'fill_constant', - 'elementwise_max', 'elementwise_div', 'elementwise_mul', - 'elementwise_mul', 'elementwise_mul', 'momentum', 'momentum', - 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'square', + 'reduce_sum', 'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', + 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', + 'elementwise_div', 'elementwise_mul', 'elementwise_mul', + 'elementwise_mul', 'momentum', 'momentum', 'momentum' ]) def test_sharding_clone_for_test(self): @@ -281,7 +279,8 @@ def test_sharding_clone_for_test(self): self.optimizer(avg_cost, strategy, train_prog, startup_prog) sharding.utils.comm_analyse(train_prog) test_prog = train_prog.clone(for_test=True) - sharding.utils.add_sync_comm(test_prog, strategy) + # assume sharding_ring_id = 1 + sharding.utils.add_sync_comm(test_prog, 1) ops = [op.type for op in test_prog.global_block().ops] self.assertEqual(ops, [ @@ -293,5 +292,238 @@ def test_sharding_clone_for_test(self): ]) +class TestFleetMetaOptimizer(TestFleetMetaOptimizer): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "3" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002,127.0.0.1:36003,127.0.0.1:36004" + + def test_sharding_with_mp(self): + # NOTE(JZ-LIANG) MP parallelism need user to build model with MP API + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, _ = self.net(train_prog, startup_prog) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "segment_anchors": None, + 
"sharding_degree": 2, + "hybrid_dp": False, + "gradient_merge_acc_step": 1, + "mp_degree": 2 + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # should has ring id for MP + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(0, created_ring_ids) + + # check correctness of MP group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + def test_sharding_hybrid_dp(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, _ = self.net(train_prog, startup_prog) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "segment_anchors": None, + "sharding_degree": 2, + "dp_degree": 2, + "hybrid_dp": True, + "gradient_merge_acc_step": 1, + "mp_degree": 1 + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check ring id for outter dp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(2, created_ring_ids) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + # check loss scale for sharding hybrid dp + scale_ = -1 + for op in main_prog_ops: + if op.type == "scale": + scale_ = float(op.desc.attr("scale")) + self.assertEqual(scale_, 0.25) + + # check program (allreudce) + ops = [op.type for op in main_prog_ops] + self.assertEqual(ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 
'c_sync_comm_stream', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_hybrid_dp_gm(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, _ = self.net(train_prog, startup_prog) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "segment_anchors": None, + "sharding_degree": 2, + "dp_degree": 2, + "hybrid_dp": True, + "gradient_merge_acc_step": 4, + "mp_degree": 1 + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check ring id for outter dp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(2, created_ring_ids) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + # check program + fw_bw_ops = [op.type for op in train_prog.blocks[0].ops] + opt_ops = [op.type for op in train_prog.blocks[2].ops] + self.assertEqual(fw_bw_ops, [ + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'c_sync_calc_stream', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_sync_comm_stream', + 'mul', + 'elementwise_add', + 'tanh', + 'mul', + 'elementwise_add', + 'tanh', + 'mul', + 'elementwise_add', + 'softmax', + 'cross_entropy2', + 'mean', + 'fill_constant', + 'scale', + 'mean_grad', + 'cross_entropy_grad2', + 'softmax_grad', + 'elementwise_add_grad', + 'mul_grad', + 'tanh_grad', + 'elementwise_add_grad', + 'mul_grad', + 'tanh_grad', + 'elementwise_add_grad', + 'mul_grad', + 'c_sync_calc_stream', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_sync_comm_stream', + 'elementwise_add', + 'elementwise_add', + 'elementwise_add', + 'increment', + 'elementwise_mod', + 'equal', + 'conditional_block', + ]) + self.assertEqual(opt_ops, [ + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'scale', + 'scale', 'scale', 'momentum', 'momentum', 'momentum', + 'fill_constant', 'fill_constant', 'fill_constant' + ]) + + # # check loss scale for gradient merge + scale_ = -1 + for op in train_prog.blocks[2].ops: + if op.type == "scale": + scale_ = float(op.desc.attr("scale")) + self.assertEqual(scale_, 0.25) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index 2d850db7837722..19944aba46df0a 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -84,10 +84,7 @@ def test_errors(self): TypeError, paddle.full, shape=[1], fill_value=5, dtype='uint4') # The 
argument dtype of full must be one of bool, float16, - #float32, float64, int32 or int64 - - self.assertRaises( - TypeError, paddle.full, shape=[1], fill_value=5, dtype='uint8') + #float32, float64, uint8, int32 or int64 # The argument shape's type of full_op must be list, tuple or Variable. def test_shape_type(): diff --git a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py new file mode 100644 index 00000000000000..e4c469599d72c0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import paddle.nn as nn +import unittest +from paddle.distributed import fleet +import numpy as np + + +class TestCommunicateTopology(unittest.TestCase): + def test_topology(self): + topo = fleet.CommunicateTopology(["dp", "mp", "pp"], [2, 2, 2]) + + # test get_comm_list + dp_comm_list = [[0, 4], [1, 5], [2, 6], [3, 7]] + mp_comm_list = [[0, 2], [1, 3], [4, 6], [5, 7]] + pp_comm_list = [[0, 1], [2, 3], [4, 5], [6, 7]] + + np.testing.assert_array_equal(dp_comm_list, topo.get_comm_list("dp")) + np.testing.assert_array_equal(mp_comm_list, topo.get_comm_list("mp")) + np.testing.assert_array_equal(pp_comm_list, topo.get_comm_list("pp")) + + # test get_hybrid_group_names + parallel_names = ["dp", "mp", "pp"] + np.testing.assert_array_equal(parallel_names, + topo.get_hybrid_group_names()) + + # test get_dims + np.testing.assert_array_equal(2, topo.get_dim("dp")) + np.testing.assert_array_equal(2, topo.get_dim("mp")) + np.testing.assert_array_equal(2, topo.get_dim("pp")) + + # test world size + self.assertEqual(topo.world_size(), 8) + + # test get_rank + self.assertEqual(topo.get_rank(dp=0, mp=0, pp=0), 0) + self.assertEqual(topo.get_rank(dp=0, mp=0, pp=1), 1) + self.assertEqual(topo.get_rank(dp=0, mp=1, pp=0), 2) + self.assertEqual(topo.get_rank(dp=0, mp=1, pp=1), 3) + self.assertEqual(topo.get_rank(dp=1, mp=0, pp=0), 4) + self.assertEqual(topo.get_rank(dp=1, mp=0, pp=1), 5) + self.assertEqual(topo.get_rank(dp=1, mp=1, pp=0), 6) + self.assertEqual(topo.get_rank(dp=1, mp=1, pp=1), 7) + + # test get_coord + self.assertEqual(topo.get_coord(0), topo.coordinate(0, 0, 0)) + self.assertEqual(topo.get_coord(1), topo.coordinate(0, 0, 1)) + self.assertEqual(topo.get_coord(2), topo.coordinate(0, 1, 0)) + self.assertEqual(topo.get_coord(3), topo.coordinate(0, 1, 1)) + self.assertEqual(topo.get_coord(4), topo.coordinate(1, 0, 0)) + self.assertEqual(topo.get_coord(5), topo.coordinate(1, 0, 1)) + self.assertEqual(topo.get_coord(6), topo.coordinate(1, 1, 0)) + self.assertEqual(topo.get_coord(7), topo.coordinate(1, 1, 1)) + + # test get_axis_list + self.assertEqual(topo.get_axis_list("dp", 0), [0, 1, 2, 3]) + self.assertEqual(topo.get_axis_list("dp", 1), [4, 5, 6, 7]) + self.assertEqual(topo.get_axis_list("mp", 0), [0, 1, 4, 5]) + 
self.assertEqual(topo.get_axis_list("mp", 1), [2, 3, 6, 7]) + self.assertEqual(topo.get_axis_list("pp", 0), [0, 2, 4, 6]) + self.assertEqual(topo.get_axis_list("pp", 1), [1, 3, 5, 7]) + + # test get_dim_size + self.assertEqual(topo.get_dim_size("dp"), 2) + self.assertEqual(topo.get_dim_size("mp"), 2) + self.assertEqual(topo.get_dim_size("pp"), 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 672ffa9d394184..9f0dcdb4d8f0c2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -930,7 +930,7 @@ def test_state_shape_mismatch(self): paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) para_state_dict = paddle.load( - os.path.join('saved_dy', 'emb_dy.pdparams')) + os.path.join('saved_dy', 'emb_dy.pdparams'), return_numpy=True) para_state_dict['weight'] = np.expand_dims( para_state_dict['weight'], axis=-1) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py index 2a74d29e1ee98f..645a05e75f6fba 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py @@ -75,10 +75,12 @@ def test_main(self): self.assertEqual(actual_persistable_vars, expected_persistable_vars) - dirname = './traced_layer_test_non_persistable_vars' - traced_layer.save_inference_model(dirname=dirname) - filenames = set([f for f in os.listdir(dirname) if f != '__model__']) - self.assertEqual(filenames, expected_persistable_vars) + traced_layer.save_inference_model( + path='./traced_layer_test_non_persistable_vars') + self.assertTrue('traced_layer_test_non_persistable_vars.pdmodel' in + os.listdir('./')) + self.assertTrue('traced_layer_test_non_persistable_vars.pdiparams' in + os.listdir('./')) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py index fba16959901a88..c35188623b4400 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_loss.py +++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py @@ -44,8 +44,10 @@ def run_imperative(self): self.assertTrue(dy_result.shape, [10, 10, 5]) def run_static(self, use_gpu=False): - input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.fluid.data(name='label', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data( + name='input', shape=[10, 10, 5], dtype='float32') + label = paddle.fluid.data( + name='label', shape=[10, 10, 5], dtype='float32') result0 = paddle.nn.functional.l1_loss(input, label) result1 = paddle.nn.functional.l1_loss(input, label, reduction='sum') result2 = paddle.nn.functional.l1_loss(input, label, reduction='none') @@ -127,8 +129,10 @@ def run_imperative(self): self.assertTrue(dy_result.shape, [10, 10, 5]) def run_static(self, use_gpu=False): - input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.fluid.data(name='label', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data( + name='input', shape=[10, 10, 5], dtype='float32') + label = paddle.fluid.data( + name='label', shape=[10, 10, 5], dtype='float32') l1_loss = paddle.nn.loss.L1Loss() result0 = 
l1_loss(input, label) l1_loss = paddle.nn.loss.L1Loss(reduction='sum') diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh b/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh index bee230fba5a7e2..d9d64e4dfa693a 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + unset https_proxy http_proxy nohup python -u test_listen_and_serv_op.py > test_listen_and_serv_op.log 2>&1 & diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py index cf273876b1f2f8..bea2f6c8b38b23 100644 --- a/python/paddle/fluid/tests/unittests/test_monitor.py +++ b/python/paddle/fluid/tests/unittests/test_monitor.py @@ -17,6 +17,8 @@ from __future__ import print_function import paddle +paddle.enable_static() + import paddle.fluid as fluid import paddle.fluid.core as core import numpy as np @@ -52,6 +54,11 @@ def test_dataset_run_with_stat(self): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) + embs = [] + for x in slots_vars: + emb = fluid.layers.embedding(x, is_sparse=True, size=[100001, 4]) + embs.append(emb) + dataset = paddle.distributed.InMemoryDataset() dataset._set_batch_size(32) dataset._set_thread(3) @@ -74,11 +81,17 @@ def test_dataset_run_with_stat(self): for i in range(self.epoch_num): for data in data_loader(): exe.run(fluid.default_main_program(), feed=data) + else: for i in range(self.epoch_num): try: - exe.train_from_dataset(fluid.default_main_program(), - dataset) + exe.train_from_dataset( + fluid.default_main_program(), + dataset, + fetch_list=[embs[0], embs[1]], + fetch_info=["emb0", "emb1"], + print_period=1) + except Exception as e: self.assertTrue(False) diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index bc5d35d3254bc4..89eef6ca24243c 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -191,8 +191,10 @@ def test_NNFunctionalMseLoss_mean(self): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=dim, dtype='float32') - target = paddle.fluid.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data( + name='input', shape=dim, dtype='float32') + target = paddle.fluid.data( + name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'mean') exe = paddle.static.Executor(place) @@ -225,8 +227,10 @@ def test_NNFunctionalMseLoss_sum(self): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=dim, 
dtype='float32') - target = paddle.fluid.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data( + name='input', shape=dim, dtype='float32') + target = paddle.fluid.data( + name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'sum') exe = paddle.static.Executor(place) @@ -259,8 +263,10 @@ def test_NNFunctionalMseLoss_none(self): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=dim, dtype='float32') - target = paddle.fluid.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data( + name='input', shape=dim, dtype='float32') + target = paddle.fluid.data( + name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'none') exe = paddle.static.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py index 0533a0d09fa0de..3bb3e843b1b11a 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py @@ -160,5 +160,6 @@ def run_main(self, num_workers, places): print("time cost", ret['time'], 'step_list', ret['step']) return ret + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_new_group.sh b/python/paddle/fluid/tests/unittests/test_new_group.sh new file mode 100755 index 00000000000000..d0b29a64145c60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_new_group.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 new_group.py +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 hybrid_communicate_group.py diff --git a/python/paddle/fluid/tests/unittests/test_new_group_api.py b/python/paddle/fluid/tests/unittests/test_new_group_api.py new file mode 100644 index 00000000000000..b9b80d3b431eae --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_new_group_api.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
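`test_new_group.sh` above drives `new_group.py` and `hybrid_communicate_group.py` on two GPUs through `paddle.distributed.launch`. A minimal sketch of the pattern such a launched script follows (illustrative only: `new_group.py` itself is not shown in this diff, and the group-aware `all_reduce` signature is assumed from the API under test):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()              # one process per GPU under the launcher
    group = dist.new_group(ranks=[0, 1])  # explicit communication group
    x = paddle.to_tensor([float(dist.get_rank())])
    dist.all_reduce(x, group=group)       # collective restricted to that group
    print("rank", dist.get_rank(), "sum", x.numpy())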
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_api_base import TestDistBase + +paddle.enable_static() + + +class TestCollectiveAllreduceAPI(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_nccl(self): + self.check_with_place("collective_allreduce_new_group_api.py", + "allreduce", "nccl") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 06f63d1416b8f4..b58d63969a5e5c 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -18,10 +18,15 @@ import numpy as np import os import sys +import six import paddle import paddle.nn as nn import paddle.optimizer as opt +import paddle.fluid as fluid +from paddle.fluid.optimizer import Adam +import paddle.fluid.framework as framework +from test_imperative_base import new_program_scope BATCH_SIZE = 16 BATCH_NUM = 4 @@ -31,7 +36,10 @@ IMAGE_SIZE = 784 CLASS_NUM = 10 -LARGE_PARAM = 2**26 +if six.PY2: + LARGE_PARAM = 2**2 +else: + LARGE_PARAM = 2**26 def random_batch_reader(): @@ -95,15 +103,22 @@ def test_large_parameters_paddle_save(self): path = os.path.join("test_paddle_save_load_large_param_save", "layer.pdparams") - paddle.save(layer.state_dict(), path) + if six.PY2: + protocol = 2 + else: + protocol = 4 + paddle.save(save_dict, path, protocol=protocol) dict_load = paddle.load(path) # compare results before and after saving for key, value in save_dict.items(): - self.assertTrue(np.array_equal(dict_load[key], value.numpy())) + self.assertTrue( + np.array_equal(dict_load[key].numpy(), value.numpy())) class TestSaveLoadPickle(unittest.TestCase): def test_pickle_protocol(self): + # enable dygraph mode + paddle.disable_static() # create network layer = LinearNet() save_dict = layer.state_dict() @@ -124,11 +139,236 @@ def test_pickle_protocol(self): if sys.version_info.major >= 3 and sys.version_info.minor >= 4: protocols += [3, 4] for protocol in protocols: - paddle.save(save_dict, path, protocol) + paddle.save(save_dict, path, pickle_protocol=protocol) dict_load = paddle.load(path) # compare results before and after saving for key, value in save_dict.items(): - self.assertTrue(np.array_equal(dict_load[key], value.numpy())) + self.assertTrue( + np.array_equal(dict_load[key].numpy(), value.numpy())) + + +class TestSaveLoadAny(unittest.TestCase): + def set_zero(self, prog, place, scope=None): + if scope is None: + scope = fluid.global_scope() + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + ten = scope.find_var(var.name).get_tensor() + if ten is not None: + ten.set(np.zeros_like(np.array(ten)), place) + new_t = np.array(scope.find_var(var.name).get_tensor()) + self.assertTrue(np.sum(np.abs(new_t)) == 0) + + def replace_static_save(self, program, model_path, pickle_protocol=2): + with self.assertRaises(TypeError): + program.state_dict(1) + with self.assertRaises(TypeError): + program.state_dict(scope=1) + with self.assertRaises(ValueError): + program.state_dict('x') + state_dict_param = program.state_dict('param') + paddle.save(state_dict_param, model_path + '.pdparams') + state_dict_opt = program.state_dict('opt') + paddle.save(state_dict_opt, model_path + '.pdopt') + state_dict_all = program.state_dict() + paddle.save(state_dict_opt, model_path + '.pdall') + + def replace_static_load(self, 
program, model_path): + with self.assertRaises(TypeError): + program.set_state_dict(1) + state_dict_param = paddle.load(model_path + '.pdparams') + state_dict_param['fake_var_name.@@'] = np.random.randn(1, 2) + state_dict_param['static_x'] = 'UserWarning' + program.set_state_dict(state_dict_param) + state_dict_param['static_x'] = np.random.randn(1, 2) + program.set_state_dict(state_dict_param) + program.set_state_dict(state_dict_param) + state_dict_opt = paddle.load(model_path + '.pdopt') + program.set_state_dict(state_dict_opt) + + def test_replace_static_save_load(self): + paddle.enable_static() + with new_program_scope(): + x = paddle.static.data( + name="static_x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 10) + z = paddle.static.nn.fc(z, 10, bias_attr=False) + loss = fluid.layers.reduce_mean(z) + opt = Adam(learning_rate=1e-3) + opt.minimize(loss) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + fake_inputs = np.random.randn(2, IMAGE_SIZE).astype('float32') + exe.run(prog, feed={'static_x': fake_inputs}, fetch_list=[loss]) + base_map = {} + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_map[var.name] = t + path = os.path.join("test_replace_static_save_load", "model") + # paddle.save, legacy paddle.fluid.load + self.replace_static_save(prog, path) + self.set_zero(prog, place) + paddle.fluid.io.load(prog, path) + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + self.assertTrue(np.array_equal(new_t, np.array(base_t))) + # legacy paddle.fluid.save, paddle.load + paddle.fluid.io.save(prog, path) + self.set_zero(prog, place) + self.replace_static_load(prog, path) + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + self.assertTrue(np.array_equal(new_t, base_t)) + # test for return tensor + path_vars = 'test_replace_save_load_return_tensor_static/model' + for var in prog.list_vars(): + if var.persistable: + tensor = var.get_value(fluid.global_scope()) + paddle.save(tensor, os.path.join(path_vars, var.name)) + with self.assertRaises(TypeError): + var.get_value('fluid.global_scope()') + with self.assertRaises(ValueError): + x.get_value() + with self.assertRaises(TypeError): + x.set_value('1') + fake_data = np.zeros([3, 2, 1, 2, 3]) + with self.assertRaises(TypeError): + x.set_value(fake_data, '1') + with self.assertRaises(ValueError): + x.set_value(fake_data) + with self.assertRaises(ValueError): + var.set_value(fake_data) + # set var to zero + self.set_zero(prog, place) + for var in prog.list_vars(): + if var.persistable: + tensor = paddle.load( + os.path.join(path_vars, var.name), return_numpy=False) + var.set_value(tensor) + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + self.assertTrue(np.array_equal(new_t, base_t)) + + def test_paddle_save_load_v2(self): + paddle.disable_static() + layer = LinearNet() + state_dict = layer.state_dict() + path = 'paddle_save_load_v2/model.pdparams' + with self.assertRaises(TypeError): + paddle.save(state_dict, path, use_binary_format='False') 
+ # legacy paddle.save, paddle.load + paddle.framework.io._legacy_save(state_dict, path) + load_dict_tensor = paddle.load(path, return_numpy=False) + # legacy paddle.load, paddle.save + paddle.save(state_dict, path) + load_dict_np = paddle.framework.io._legacy_load(path) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(v.numpy(), load_dict_tensor[k].numpy())) + self.assertTrue(np.array_equal(v.numpy(), load_dict_np[k])) + + def test_single_pickle_var_dygraph(self): + # enable dygraph mode + paddle.disable_static() + layer = LinearNet() + path = 'paddle_save_load_v2/var_dygraph' + tensor = layer._linear.weight + with self.assertRaises(ValueError): + paddle.save(tensor, path, pickle_protocol='3') + with self.assertRaises(ValueError): + paddle.save(tensor, path, pickle_protocol=5) + paddle.save(tensor, path) + t_dygraph = paddle.load(path) + np_dygraph = paddle.load(path, return_numpy=True) + self.assertTrue(isinstance(t_dygraph, paddle.fluid.core.VarBase)) + self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph)) + self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy())) + paddle.enable_static() + lod_static = paddle.load(path) + np_static = paddle.load(path, return_numpy=True) + self.assertTrue(isinstance(lod_static, paddle.fluid.core.LoDTensor)) + self.assertTrue(np.array_equal(tensor.numpy(), np_static)) + self.assertTrue(np.array_equal(tensor.numpy(), np.array(lod_static))) + + def test_single_pickle_var_static(self): + # enable static mode + paddle.enable_static() + with new_program_scope(): + # create network + x = paddle.static.data( + name="x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 128) + loss = fluid.layers.reduce_mean(z) + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + for var in prog.list_vars(): + if list(var.shape) == [IMAGE_SIZE, 128]: + tensor = var.get_value() + break + scope = fluid.global_scope() + origin_tensor = np.array(tensor) + path = 'test_single_pickle_var_static/var' + paddle.save(tensor, path) + self.set_zero(prog, place, scope) + # static load + lod_static = paddle.load(path) + np_static = paddle.load(path, return_numpy=True) + # set_tensor(np.ndarray) + var.set_value(np_static, scope) + self.assertTrue(np.array_equal(origin_tensor, np.array(tensor))) + # set_tensor(LoDTensor) + self.set_zero(prog, place, scope) + var.set_value(lod_static, scope) + self.assertTrue(np.array_equal(origin_tensor, np.array(tensor))) + # enable dygraph mode + paddle.disable_static() + var_dygraph = paddle.load(path) + np_dygraph = paddle.load(path, return_numpy=True) + self.assertTrue(np.array_equal(np.array(tensor), np_dygraph)) + self.assertTrue(np.array_equal(np.array(tensor), var_dygraph.numpy())) + + def test_dygraph_save_static_load(self): + inps = np.random.randn(1, IMAGE_SIZE).astype('float32') + path = 'test_dygraph_save_static_load/dy-static.pdparams' + paddle.disable_static() + with paddle.utils.unique_name.guard(): + layer = LinearNet() + state_dict_dy = layer.state_dict() + paddle.save(state_dict_dy, path) + paddle.enable_static() + with new_program_scope(): + layer = LinearNet() + data = paddle.static.data( + name='x_static_save', shape=(None, IMAGE_SIZE), dtype='float32') + y_static = layer(data) + program = paddle.static.default_main_program() + place = fluid.CPUPlace( + ) if not 
paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + state_dict = paddle.load(path, keep_name_table=True) + program.set_state_dict(state_dict) + state_dict_param = program.state_dict("param") + for name, tensor in state_dict_dy.items(): + self.assertTrue( + np.array_equal(tensor.numpy(), + np.array(state_dict_param[tensor.name]))) class TestSaveLoad(unittest.TestCase): @@ -158,7 +398,9 @@ def build_and_train_model(self): def check_load_state_dict(self, orig_dict, load_dict): for var_name, value in orig_dict.items(): - self.assertTrue(np.array_equal(value.numpy(), load_dict[var_name])) + load_value = load_dict[var_name].numpy() if hasattr( + load_dict[var_name], 'numpy') else np.array(load_dict[var_name]) + self.assertTrue(np.array_equal(value.numpy(), load_value)) def test_save_load(self): layer, opt = self.build_and_train_model() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py new file mode 100644 index 00000000000000..fa571bde5e43bf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py @@ -0,0 +1,91 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
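The save/load tests above keep crossing execution modes in both directions. A condensed sketch of the dygraph-save, static-load direction (a toy layer stands in for the tests' `LinearNet`; the real tests run under a `unique_name` guard so parameter names line up):

    import paddle

    paddle.disable_static()
    net = paddle.nn.Linear(8, 2)
    paddle.save(net.state_dict(), 'dy.pdparams')       # save from dygraph

    paddle.enable_static()
    x = paddle.static.data(name='x', shape=[None, 8], dtype='float32')
    y = paddle.static.nn.fc(x, 2)                      # static counterpart
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())   # materialize parameters first
    prog = paddle.static.default_main_program()
    prog.set_state_dict(paddle.load('dy.pdparams'))    # binds only where names match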
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner + +flag_name = os.path.splitext(__file__)[0] + + +class TestDygraphControlFlowSame(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_control_flow_same.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestFleetDygraphControlFlowSame(TestDygraphControlFlowSame): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + +class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._accumulate_gradient = True + + +class TestDygraphControlFlowDiff(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_control_flow_different.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestFleetDygraphControlFlowDiff(TestDygraphControlFlowDiff): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + +class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._accumulate_gradient = True + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py new file mode 100644 index 00000000000000..1d2a39751905e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
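The control-flow cases above are the motivating scenario for `find_unused_parameters`: when a branch leaves some parameters without gradients on a given step, the gradient reducer has to scan for them instead of waiting on gradients that never arrive. A hedged sketch with a toy branching model (not the model used by the launched test scripts; assumes the `paddle.DataParallel` keyword this PR wires up):

    import paddle
    import paddle.distributed as dist

    class BranchyNet(paddle.nn.Layer):
        def __init__(self):
            super(BranchyNet, self).__init__()
            self.a = paddle.nn.Linear(4, 4)
            self.b = paddle.nn.Linear(4, 4)   # gets no gradient on even steps

        def forward(self, x, step):
            return self.a(x) if step % 2 == 0 else self.b(x)

    dist.init_parallel_env()
    model = paddle.DataParallel(BranchyNet(), find_unused_parameters=True)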
+ +from __future__ import print_function + +import unittest +import time +import paddle.fluid as fluid + +from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, get_gpus, start_local_trainers + + +def get_cluster_from_args(selected_gpus): + cluster_node_ips = '127.0.0.1' + node_ip = '127.0.0.1' + + node_ips = [x.strip() for x in cluster_node_ips.split(',')] + + node_ips.index(node_ip) + + free_ports = None + + free_ports = find_free_ports(len(selected_gpus)) + if free_ports is not None: + free_ports = list(free_ports) + + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) + + +class TestMultipleGpus(unittest.TestCase): + def run_mnist_2gpu(self, target_file_name): + if not fluid.core.is_compiled_with_cuda( + ) or fluid.core.get_cuda_device_count() == 0: + return + + selected_gpus = get_gpus('0,1') + cluster = None + pod = None + + cluster, pod = get_cluster_from_args(selected_gpus) + + procs = start_local_trainers( + cluster, + pod, + training_script=target_file_name, + training_script_args=[]) + + while True: + alive = watch_local_trainers(procs, cluster.trainers_nranks()) + + if not alive: + print("Local procs complete, POD info:{}".format(pod)) + break + time.sleep(3) + + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_gradient_check.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index a3a3c5bfe3df59..782d2304619f2a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -73,6 +73,7 @@ def _setup_config(self): self._dygraph = True self._use_fleet_api = True self._accumulate_gradient = True + self._find_unused_parameters = False def test_mnist(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index bef64385f135b3..e0aab8541a542c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -54,6 +54,7 @@ def _setup_config(self): self._nccl2_mode = True self._dygraph = True self._accumulate_gradient = True + self._find_unused_parameters = False def test_transformer(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py index 5906114cd24f32..75fa6f7c71d0a5 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py @@ -26,13 +26,13 @@ flag_name = os.path.splitext(__file__)[0] -class TestParallelDygraphMnist(TestDistBase): +class TestParallelDygraphUnusedVar(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True - def test_mnist(self): + def test_net(self): if fluid.core.is_compiled_with_cuda(): self.check_with_place( "parallel_dygraph_unused_variables.py", @@ -41,6 +41,14 @@ def test_mnist(self): log_name=flag_name) +class 
TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): def test_mnist_with_spawn(self): if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): @@ -48,17 +56,31 @@ def test_mnist_with_spawn(self): test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) -class TestFleetDygraphMnist(TestDistBase): +class TestParallelDygraphNoVar(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_none_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSharedUnusedVariables(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True - self._use_fleet_api = True def test_mnist(self): if fluid.core.is_compiled_with_cuda(): self.check_with_place( - "parallel_dygraph_unused_variables.py", + "parallel_dygraph_shared_unused_var.py", delta=1e-5, check_error_log=True, log_name=flag_name) diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py index f75d6e9df540b5..f1a409c712fc32 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -97,8 +97,10 @@ def test_static_graph_functional(self): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x_1 = paddle.fluid.data(name="x", shape=[2, 9, 4, 4], dtype="float64") - x_2 = paddle.fluid.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + x_1 = paddle.fluid.data( + name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=[2, 4, 4, 9], dtype="float64") out_1 = F.pixel_shuffle(x_1, 3) out_2 = F.pixel_shuffle(x_2, 3, "NHWC") @@ -123,8 +125,10 @@ def test_static_graph_layer(self): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x_1 = paddle.fluid.data(name="x", shape=[2, 9, 4, 4], dtype="float64") - x_2 = paddle.fluid.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + x_1 = paddle.fluid.data( + name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=[2, 4, 4, 9], dtype="float64") # init instance ps_1 = paddle.nn.PixelShuffle(3) ps_2 = paddle.nn.PixelShuffle(3, "NHWC") diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py index 15fd79542d608f..cdfcbb4e4e735d 100644 --- a/python/paddle/fluid/tests/unittests/test_prod_op.py +++ b/python/paddle/fluid/tests/unittests/test_prod_op.py @@ -55,7 +55,8 @@ def run_imperative(self): self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) def run_static(self, use_gpu=False): - input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data( + name='input', shape=[10, 10, 5], dtype='float32') result0 = paddle.prod(input) result1 = paddle.prod(input, axis=1) result2 = paddle.prod(input, axis=-1) @@ -114,7 +115,8 @@ def test_error(self): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): x = paddle.fluid.data(name='x', shape=[2, 2, 4], dtype='float32') - bool_x = paddle.fluid.data(name='bool_x', shape=[2, 2, 4], 
dtype='bool') + bool_x = paddle.fluid.data( + name='bool_x', shape=[2, 2, 4], dtype='bool') # The argument x should be a Tensor self.assertRaises(TypeError, paddle.prod, [1]) diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index 95ae1eecc66141..e71adae8d9b6eb 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -128,15 +128,18 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.selu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.selu, x_int32) # The scale must be greater than 1.0 - x_fp32 = paddle.fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[12, 10], dtype='float32') self.assertRaises(ValueError, F.selu, x_fp32, -1.0) # The alpha must be no less than 0 self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0) # the float16 input dtype is supported - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.selu(x_fp16) diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py index 85f9501e53f4ab..2ef04d9cbfa73f 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py @@ -42,8 +42,10 @@ def test_static(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - logit = paddle.fluid.data(name='logit', shape=logit_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + logit = paddle.fluid.data( + name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') feed_dict = {"logit": logit_np, "label": label_np} normalizer = None diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py index 08413d711be55b..c5dc98af5c8f6d 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py @@ -25,12 +25,17 @@ import pickle import os +# Python 2.x has no pickle protocol 4 and so cannot save or load large (>4GB) parameters; use a trivial size there.
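Editor's note: the protocol switch below leans on a stdlib fact worth spelling out — pickle protocol 4 (PEP 3154, Python 3.4+) introduced 8-byte framing and is the first protocol that can serialize objects larger than 4 GB, which is why the test keeps LARGE_PARAM trivial on Python 2. A minimal stdlib illustration, with the payload shrunk so it runs quickly:

import pickle

payload = bytes(2**20)  # stand-in for a large parameter block
blob = pickle.dumps(payload, protocol=4)  # protocol 4 framing also copes with >4GB objects
assert pickle.loads(blob) == payload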
+if six.PY2: + LARGE_PARAM = 2 +else: + LARGE_PARAM = 2**26 + class TestStaticSaveLoadLargeParameters(unittest.TestCase): def test_large_parameters_static_save(self): # enable static mode paddle.enable_static() - LARGE_PARAM = 2**26 with new_program_scope(): # create network x = paddle.static.data( @@ -54,7 +59,11 @@ def test_large_parameters_static_save(self): path = os.path.join("test_static_save_load_large_param", "static_save") - paddle.fluid.save(prog, path) + if six.PY2: + protocol = 2 + else: + protocol = 4 + paddle.fluid.save(prog, path, pickle_protocol=protocol) # set var to zero for var in prog.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -92,3 +101,7 @@ def test_large_parameters_static_save(self): .get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py new file mode 100644 index 00000000000000..a390dd9d807564 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
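Editor's note: the new test file below pins down the Tensor.register_hook contract — a hook receives the tensor's gradient during backward and may return a replacement, and register_hook returns a helper whose remove() detaches it. A minimal sketch distilled from the tests (not part of this PR):

import paddle

t = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
h = t.register_hook(lambda grad: grad * 2)  # doubles t.grad during backward
t.sum().backward()
# t.grad is now [2., 2., 2.] instead of [1., 1., 1.]
h.remove()  # later backward passes are unaffected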
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.nn as nn + + +class SimpleNet(nn.Layer): + def __init__(self, in_size, out_size): + super(SimpleNet, self).__init__() + self.linear1 = nn.Linear(in_size, in_size) + self.linear2 = nn.Linear(in_size, out_size) + + def forward(self, x, hook=None, register=False, remove=False): + ret1 = self.linear1(x) + if hook is not None: + if register: + h = ret1.register_hook(hook) + if remove: + h.remove() + ret2 = self.linear2(ret1) + out = paddle.mean(ret2, axis=-1) + return ret1, out + + +class TestTensorRegisterHook(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.in_size = 10 + self.out_size = 10 + self.batch_size = 4 + self.devices = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.devices.append("gpu") + + def test_hook_for_interior_var(self): + def run_double_hook_for_interior_var(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(double_hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # z.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + # w.grad is not changed by hook + self.assertTrue(np.array_equal(w.grad, z.numpy())) + # x.grad and y.grad are changed if run hook + self.assertTrue( + np.array_equal(x.grad, + z.numpy() * 2 if not removed else z.numpy())) + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + def run_print_hook_for_interior_var(print_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(print_hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # all grads are not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + self.assertTrue(np.array_equal(y.grad, z.numpy())) + + def double_hook(grad): + grad = grad * 2 + print(grad) + return grad + + def print_hook(grad): + print(grad) + + # register hook + run_double_hook_for_interior_var(double_hook) + # register hook and removed + run_double_hook_for_interior_var(double_hook, removed=True) + + # register hook + run_double_hook_for_interior_var(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_interior_var(lambda grad: grad * 2, removed=True) + + # register hook + run_print_hook_for_interior_var(print_hook) + # register hook and removed + run_print_hook_for_interior_var(print_hook, removed=True) + + def test_hook_for_leaf_var(self): + def run_double_hook_for_leaf_var(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + helper = y.register_hook(double_hook) + + w = x + y + w.stop_gradient = False + + z = 
paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # z.grad, w.grad, x.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + # y.grad are changed if run hook + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + # register hook + run_double_hook_for_leaf_var(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True) + + def test_hook_for_accumulated_grad(self): + def run_double_hook_for_accumulated_grad(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + a = paddle.to_tensor([0., 1., 1., 2.]) + b = paddle.to_tensor([0., 0., 1., 2.]) + a.stop_gradient = False + b.stop_gradient = False + + helper1 = a.register_hook(double_hook) + + x = a + b + x.stop_gradient = False + + helper2 = x.register_hook(double_hook) + + y = paddle.to_tensor([4., 5., 6., 7.]) + z = paddle.to_tensor([1., 2., 3., 4.]) + y.stop_gradient = False + z.stop_gradient = False + + o1 = x + y + o2 = x + z + o1.stop_gradient = False + o2.stop_gradient = False + + o = o1.matmul(o2) + + # remove hook before backward + if removed: + helper1.remove() + helper2.remove() + + o.backward() + + base_grad = np.array([5., 9., 13., 19.]) + # x.grad is not changed + self.assertTrue(np.array_equal(x.grad, base_grad)) + # b.grad is changed by x.hook + self.assertTrue( + np.array_equal(b.grad, base_grad * 2 + if not removed else base_grad)) + # a.grad is changed by x.hook and a.hook + self.assertTrue( + np.array_equal(a.grad, base_grad * 4 + if not removed else base_grad)) + + # register hook + run_double_hook_for_accumulated_grad(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_accumulated_grad( + lambda grad: grad * 2, removed=True) + + def test_hook_in_model(self): + def run_double_hook_in_model(data, + label, + hook=None, + register=False, + remove=False): + for device in self.devices: + paddle.seed(self.seed) + paddle.set_device(device) + + net = SimpleNet(self.in_size, self.out_size) + loss_fn = nn.MSELoss() + + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + + ret1, out = net(data, hook, register, remove) + loss = loss_fn(out, label) + loss.backward() + + return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad + + data = np.random.uniform( + size=[self.batch_size, self.in_size]).astype('float32') + label = np.random.uniform(size=[self.batch_size, 1]).astype('float32') + + # get original value + ret1_grad, linear1_w_grad, linear1_b_grad = run_double_hook_in_model( + data, label) + # get value changed by hook + ret1_grad_hook, linear1_w_grad_hook, linear1_b_grad_hook = run_double_hook_in_model( + data, label, lambda grad: grad * 2, True) + # get value after removing hook + ret1_grad_rm, linear1_w_grad_rm, linear1_b_grad_rm = run_double_hook_in_model( + data, label, lambda grad: grad * 2, True, True) + + # compare original value and with hook + self.assertTrue(np.array_equal(ret1_grad, ret1_grad_hook)) + self.assertTrue(np.array_equal(linear1_w_grad * 2, linear1_w_grad_hook)) + self.assertTrue(np.array_equal(linear1_b_grad * 2, linear1_b_grad_hook)) + + # compare original value and remove hook + self.assertTrue(np.array_equal(ret1_grad, ret1_grad_rm)) + self.assertTrue(np.array_equal(linear1_w_grad, 
linear1_w_grad_rm)) + self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) + + def test_multiple_hooks_for_interior_var(self): + def run_multiple_hooks_for_interior_var(device, + hooks, + remove1=False, + remove2=False, + remove3=False): + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + + helpers = [] + for hook in hooks: + helper = w.register_hook(hook) + helpers.append(helper) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + if remove1: + helpers[0].remove() + if remove2: + helpers[1].remove() + if remove3: + helpers[2].remove() + + o.backward() + + return z.numpy(), w.grad, x.grad, y.grad + + def double_hook(grad): + return grad * 2 + + hooks = [double_hook, double_hook, double_hook] + + for device in self.devices: + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 8)) + self.assertTrue(np.array_equal(y_grad, z * 8)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove1=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove2=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove3=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove1=True, remove2=True, remove3=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z)) + self.assertTrue(np.array_equal(y_grad, z)) + + def test_hook_in_double_grad(self): + def double_print_hook(grad): + grad = grad * 2 + print(grad) + return grad + + x = paddle.ones(shape=[1], dtype='float32') + x.stop_gradient = False + + # the hook only takes effect in backward(); the gradient + # returned by the forward call to paddle.grad below is + # not processed by the hook + x.register_hook(double_print_hook) + + y = x * x + + # Since y = x * x, dx = 2 * x + dx = paddle.grad( + outputs=[y], inputs=[x], create_graph=True, retain_graph=True)[0] + + z = y + dx + self.assertTrue(x.grad is None) + + # If create_graph = True, the gradient of dx + # would be backpropagated.
Therefore, + # z = x * x + dx = x * x + 2 * x, and + # x.gradient() = 2 * x + 2 = 4.0 + # after changed by hook: 8.0 + + z.backward() + self.assertTrue(np.array_equal(x.grad, np.array([8.]))) + + def test_remove_one_hook_multiple_times(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + x.stop_gradient = False + + h = x.register_hook(lambda grad: grad * 2) + self.assertTrue(h.remove()) + self.assertFalse(h.remove()) + + def test_register_hook_for_stop_gradient_var(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + + with self.assertRaises(RuntimeError): + x.register_hook(lambda grad: grad * 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py index 38543fecac85ef..cb5186468890d8 100644 --- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py +++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py @@ -18,6 +18,7 @@ import six import unittest import paddle.nn as nn +import os class SimpleFCLayer(nn.Layer): @@ -115,36 +116,41 @@ def test_save_inference_model_err(self): dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( self.layer, [in_x]) - dirname = './traced_layer_err_msg' + path = './traced_layer_err_msg' with self.assertRaises(TypeError) as e: traced_layer.save_inference_model([0]) self.assertEqual( - "The type of 'dirname' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'str'>, but received <{} 'list'>. ". + "The type of 'path' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'str'>, but received <{} 'list'>. ". format(self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, [0], [None]) + traced_layer.save_inference_model(path, [0], [None]) self.assertEqual( "The type of 'each element of fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ". format(self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, [0], False) + traced_layer.save_inference_model(path, [0], False) self.assertEqual( "The type of 'fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ". format(self.type_str, self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, [None], [0]) + traced_layer.save_inference_model(path, [None], [0]) self.assertEqual( "The type of 'each element of feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ". format(self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, True, [0]) + traced_layer.save_inference_model(path, True, [0]) self.assertEqual( "The type of 'feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ". 
format(self.type_str, self.type_str, self.type_str), str(e.exception)) + with self.assertRaises(ValueError) as e: + traced_layer.save_inference_model("") + self.assertEqual( + "The input path MUST be format of dirname/file_prefix [dirname\\file_prefix in Windows system], " + "but received file_prefix is empty string.", str(e.exception)) - traced_layer.save_inference_model(dirname) + traced_layer.save_inference_model(path) def _train_simple_net(self): layer = None @@ -174,5 +180,25 @@ def test_linear_net_with_none(self): [in_x]) + +class TestTracedLayerSaveInferenceModel(unittest.TestCase): + """test that save_inference_model will automatically create a non-existent dir""" + + def setUp(self): + self.save_path = "./nonexist_dir/fc" + import shutil + if os.path.exists(os.path.dirname(self.save_path)): + shutil.rmtree(os.path.dirname(self.save_path)) + + def test_mkdir_when_input_path_non_exist(self): + fc_layer = SimpleFCLayer(3, 4, 2) + input_var = paddle.to_tensor(np.random.random([4, 3]).astype('float32')) + with fluid.dygraph.guard(): + dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( + fc_layer, inputs=[input_var]) + self.assertFalse(os.path.exists(os.path.dirname(self.save_path))) + traced_layer.save_inference_model(self.save_path) + self.assertTrue(os.path.exists(os.path.dirname(self.save_path))) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index f72df8cbe46409..59b4afdf8b02d2 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -23,6 +23,7 @@ paddle.enable_static() + class TestTransposeOp(OpTest): def setUp(self): self.init_op_type() @@ -151,6 +152,7 @@ def test_each_elem_value_check(): self.assertRaises(ValueError, test_each_elem_value_check) + class TestTransposeApi(unittest.TestCase): def test_static_out(self): paddle.enable_static() @@ -161,10 +163,11 @@ def test_static_out(self): place = paddle.CPUPlace() exe = paddle.static.Executor(place) x_np = np.random.random([2, 3, 4]).astype("float32") - result1, result2 = exe.run(feed={"x": x_np}, fetch_list=[x_trans1, x_trans2]) + result1, result2 = exe.run(feed={"x": x_np}, + fetch_list=[x_trans1, x_trans2]) expected_result1 = np.transpose(x_np, [1, 0, 2]) expected_result2 = np.transpose(x_np, (2, 1, 0)) - + np.testing.assert_array_equal(result1, expected_result1) np.testing.assert_array_equal(result2, expected_result2) @@ -185,6 +188,7 @@ def test_dygraph_out(self): # dygraph test paddle.enable_static() + class TestTAPI(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 8d5ab0a5be757a..690ac46e563ef0 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -190,7 +190,6 @@ def test_fake_interface_only_api(self): with fluid.dygraph.guard(): self.assertRaises(AssertionError, var.detach) self.assertRaises(AssertionError, var.numpy) - self.assertRaises(AssertionError, var.set_value, None) self.assertRaises(AssertionError, var.backward) self.assertRaises(AssertionError, var.gradient) self.assertRaises(AssertionError, var.clear_gradient) diff --git a/python/paddle/fluid/transpiler/ascend_transpiler.py b/python/paddle/fluid/transpiler/ascend_transpiler.py new file mode 100644 index
00000000000000..5593c91b5bc646 --- /dev/null +++ b/python/paddle/fluid/transpiler/ascend_transpiler.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import collective +from .. import core +OpRole = core.op_proto_and_checker_maker.OpRole +from paddle.distributed import fleet + + +class AscendTranspiler(collective.Collective): + def __init__(self, startup_program, main_program): + self.nrings = 1 + super(AscendTranspiler, self).__init__(self.nrings) + self._startup_program = startup_program + self._main_program = main_program + + def _insert_allreduce_ops(self): + block = self._main_program.global_block() + ring_id = -1 + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + + offset = idx + for i in range(0, len(op_role_var), 2): + param = block.vars[op_role_var[i]] + grad = block.vars[op_role_var[i + 1]] + if param.is_distributed: + continue + + # As we search ops reversedly, we should insert c_allreduce_sum + # op in the same way to keep the ring_id alternate + ring_id = (ring_id + 1) % self.nrings + block._insert_op( + offset + 1, + type='c_allreduce_sum', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + block._insert_op( + offset + 2, + type='scale', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'scale': 1.0 / fleet.worker_num(), + self.op_role_key: OpRole.Backward + }) + + if grad is None: + return + + def transpile(self): + self._insert_allreduce_ops() diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 3d93bed32ecc4d..3b953efab71c47 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -22,13 +22,18 @@ import sys import numpy as np +if not six.PY2: + import copyreg + import paddle # deprecated module import from paddle import fluid from paddle.fluid import core -from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict -from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer +from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict, _pickle_loads_mac +from paddle.fluid.io import _legacy_save as _legacy_static_save + +from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, in_dygraph_mode, ParamBase, _current_expected_place from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX @@ -181,7 +186,9 @@ def _build_load_path_and_config(path, config): def _parse_load_config(configs): - supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + supported_configs = [ + 
'model_filename', 'params_filename', 'keep_name_table', 'return_numpy' + ] # input check for key in configs: @@ -195,16 +202,158 @@ def _parse_load_config(configs): inner_config.model_filename = configs.get('model_filename', None) inner_config.params_filename = configs.get('params_filename', None) inner_config.keep_name_table = configs.get('keep_name_table', None) + inner_config.return_numpy = configs.get('return_numpy', False) return inner_config -def save(obj, path, pickle_protocol=2): +def _parse_save_config(configs): + supported_configs = ['use_binary_format', 'pickle_protocol'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional config (%s) of `paddle.save` is not supported." + % key) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.use_binary_format = configs.get('use_binary_format', False) + inner_config.pickle_protocol = configs.get('pickle_protocol', None) + + return inner_config + + +def _pickle_save(obj, f, protocol): + # TODO(weixin): add support for BytesIO. + if not isinstance(protocol, int): + raise ValueError("The 'protocol' MUST be `int`, but received {}".format( + type(protocol))) + + if protocol < 2 or protocol > 4: + raise ValueError("Expected 1<'protocol'<5, but received protocol={}". + format(protocol)) + + if not isinstance(obj, (core.LoDTensor, core.VarBase)): + raise NotImplementedError( + "Only 'paddle.Tensor' and 'paddle.core.LoDTensor' are supported, but received {}.". + format(type(obj))) + + def reduce_varbase(self): + data = self.numpy() + name = self.name + + return (tuple, ((name, data), )) + + def reduce_LoDTensor(self): + data = np.array(self) + + return (eval, ('data', {'data': data})) + + def add_dispatch_table(): + # Not ideal: this mutates the pickle module's global dispatch table. + pickle.dispatch_table[core.VarBase] = reduce_varbase + pickle.dispatch_table[ParamBase] = reduce_varbase + pickle.dispatch_table[core.LoDTensor] = reduce_LoDTensor + + def pop_dispatch_table(): + pickle.dispatch_table.pop(core.VarBase) + pickle.dispatch_table.pop(core.LoDTensor) + pickle.dispatch_table.pop(ParamBase) + + # When a value in the dict is larger than 4GB, there is a bug on macOS Python 3 + if sys.platform == 'darwin' and sys.version_info.major == 3: + add_dispatch_table() + pickle_bytes = pickle.dumps(obj) + pop_dispatch_table() + + max_bytes = 2**30 + for i in range(0, len(pickle_bytes), max_bytes): + f.write(pickle_bytes[i:i + max_bytes]) + else: + if six.PY2: + add_dispatch_table() + pickle_bytes = pickle.dump(obj, f, protocol) + pop_dispatch_table() + else: + pickler = pickle.Pickler(f, protocol) + pickler.dispatch_table = copyreg.dispatch_table.copy() + + pickler.dispatch_table[core.VarBase] = reduce_varbase + pickler.dispatch_table[core.LoDTensor] = reduce_LoDTensor + pickler.dispatch_table[ParamBase] = reduce_varbase + + pickler.dump(obj) + + +def _use_legacy(obj): + # TODO(weixin): If `obj` is any object, the judgment condition should be more precise. + if not isinstance(obj, dict): + return False + return True + + +def _transformed_from_varbase(obj): + # In paddle2.1 version, VarBase is saved as tuple(tensor.name, tensor.numpy()). + # When executing paddle.load, use this function to determine whether to restore to VarBase/LoDTensor.
+ if isinstance(obj, tuple) and len(obj) == 2: + if six.PY2: + name_types = (str, unicode) + else: + name_types = str + if isinstance(obj[0], name_types) and isinstance(obj[1], np.ndarray): + return True + return False + + +def _transformed_from_lodtensor(obj): + # In paddle2.1 version, LoDTensor is saved as np.array(tensor). + # When executing paddle.load, use this function to determine whether to restore to VarBase/LoDTensor. + if isinstance(obj, np.ndarray): + return True + return False + + +def _to_LodTensor(ndarray): + if not isinstance(ndarray, np.ndarray): + raise TypeError( + 'Type of `ndarray` should be numpy.ndarray, but received {}.'. + format(type(ndarray))) + t = core.LoDTensor() + place = _current_expected_place() + t.set(ndarray, place) + return t + + +def _tuple_to_tensor(obj, return_numpy): + if return_numpy: + return obj[1] + if in_dygraph_mode(): + t = paddle.to_tensor(obj[1]) + # Note: this modifies the name of the returned tensor, so loading + # the same variable multiple times may produce tensors with the same name. + t.name = obj[0] + return t + else: + return _to_LodTensor(obj[1]) + + +def _ndarray_to_tensor(obj, return_numpy): + if return_numpy: + return obj + if in_dygraph_mode(): + return paddle.to_tensor(obj) + else: + return _to_LodTensor(obj) + + +def save(obj, path, protocol=2, **configs): ''' Save an object to the specified path. .. note:: - Now only supports save ``state_dict`` of Layer or Optimizer. + Now supports saving the ``state_dict`` of Layer or Optimizer, as well as Tensor. .. note:: Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, @@ -219,8 +368,12 @@ def save(obj, path, pickle_protocol=2): obj(Object) : The object to be saved. path(str) : The path of the object to be saved. If saved in the current directory, the input path string will be used as the file name. - pickle_protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. + protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 2 + **configs(dict, optional): optional keyword arguments. The following options are currently supported: + use_binary_format(bool): When the saved object is a static graph variable, you can specify ``use_binary_format``. + If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format. + Default: False Returns: None Examples: ..
code-block:: python + # example 1: dynamic graph import paddle - emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() + + # save state_dict of emb paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=emb.parameters()) opt_state_dict = adam.state_dict() + + # save state_dict of optimizer paddle.save(opt_state_dict, "adam.pdopt") + # save weight of emb + paddle.save(emb.weight, "emb.weight.pdtensor") + + # example 2: static graph + import paddle + import paddle.static as static + + paddle.enable_static() + + # create network + x = paddle.static.data(name="x", shape=[None, 224], dtype='float32') + z = paddle.static.nn.fc(x, 10) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + for var in prog.list_vars(): + if list(var.shape) == [224, 10]: + tensor = var.get_tensor() + break + + # save/load tensor + path_tensor = 'temp/tensor.pdtensor' + paddle.save(tensor, path_tensor) + + # save/load state_dict + path_state_dict = 'temp/model.pdparams' + paddle.save(prog.state_dict("param"), path_state_dict) ''' + # 1. input check + filename = os.path.basename(path) + if filename == "": + raise ValueError("The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + + # 2. save object + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) + + config = _parse_save_config(configs) + + if not isinstance(config.use_binary_format, bool): + raise TypeError( + "Type of `use_binary_format` should be bool, but received {}.". + format(type(config.use_binary_format))) + + # `protocol` should be used; `pickle_protocol` is a deprecated arg. + if config.pickle_protocol is not None: + protocol = config.pickle_protocol + warnings.warn( + "'pickle_protocol' is a deprecated argument. Please use 'protocol' instead." + ) + + if _use_legacy(obj): + if in_dygraph_mode(): + _legacy_save(obj, path, protocol) + else: + _legacy_static_save(obj, path, protocol) + else: + # save single variable + with open(path, 'wb') as f: + _pickle_save(obj, f, protocol) + +def _legacy_save(obj, path, protocol=2): # 1. input check if not isinstance(obj, dict): raise NotImplementedError( @@ -257,13 +481,13 @@ def save(obj, path, pickle_protocol=2): "[dirname\\filename in Windows system], but received " "filename is empty string.") - if not isinstance(pickle_protocol, int): + if not isinstance(protocol, int): raise ValueError("The 'protocol' MUST be `int`, but received {}".format( - type(pickle_protocol))) + type(protocol))) - if pickle_protocol < 2 or pickle_protocol > 4: + if protocol < 2 or protocol > 4: raise ValueError("Expected 1<'protocol'<5, but received protocol={}". - format(pickle_protocol)) + format(protocol)) # 2.
save object dirname = os.path.dirname(path) @@ -274,19 +498,18 @@ def save(obj, path, pickle_protocol=2): if isinstance(obj, dict): saved_obj = _build_saved_state_dict(obj) - saved_obj = _unpack_saved_dict(saved_obj, pickle_protocol) + saved_obj = _unpack_saved_dict(saved_obj, protocol) - # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3.5/6' - if sys.platform == 'darwin' and sys.version_info.major == 3 and ( - sys.version_info.minor == 5 or sys.version_info.minor == 6): - pickle_bytes = pickle.dumps(saved_obj, protocol=pickle_protocol) + # When a value in the dict is larger than 4GB, there is a bug on macOS Python 3 + if sys.platform == 'darwin' and sys.version_info.major == 3: + pickle_bytes = pickle.dumps(saved_obj, protocol=protocol) with open(path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: with open(path, 'wb') as f: - pickle.dump(saved_obj, f, protocol=pickle_protocol) + pickle.dump(saved_obj, f, protocol=protocol) def load(path, **configs): @@ -294,7 +517,7 @@ def load(path, **configs): Load an object that can be used in paddle from the specified path. .. note:: - Now only supports load ``state_dict`` of Layer or Optimizer. + Now supports loading the ``state_dict`` of Layer or Optimizer, as well as Tensor. .. note:: In order to use the model parameters saved by paddle more efficiently, @@ -331,7 +554,9 @@ def load(path, **configs): ``save_inference_model`` save format. Default file name is :code:`__model__` . (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` save format. No default file name, save variables separately - by default. + by default. + (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. + Default False.
Returns: Object(Object): a target object that can be used in paddle Examples: .. code-block:: python import paddle + # example 1: dynamic graph + import paddle emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() + + # save state_dict of emb paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=emb.parameters()) opt_state_dict = adam.state_dict() + + # save state_dict of optimizer paddle.save(opt_state_dict, "adam.pdopt") + # save weight of emb + paddle.save(emb.weight, "emb.weight.pdtensor") + # load state_dict of emb load_layer_state_dict = paddle.load("emb.pdparams") + # load state_dict of optimizer load_opt_state_dict = paddle.load("adam.pdopt") + # load weight of emb + load_weight = paddle.load("emb.weight.pdtensor") + + + # example 2: static graph + import paddle + import paddle.static as static + + paddle.enable_static() + + # create network + x = paddle.static.data(name="x", shape=[None, 224], dtype='float32') + z = paddle.static.nn.fc(x, 10) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + for var in prog.list_vars(): + if list(var.shape) == [224, 10]: + tensor = var.get_tensor() + break + + # save/load tensor + path_tensor = 'temp/tensor.pdtensor' + paddle.save(tensor, path_tensor) + load_tensor = paddle.load(path_tensor) + + # save/load state_dict + path_state_dict = 'temp/model.pdparams' + paddle.save(prog.state_dict("param"), path_state_dict) + load_state_dict = paddle.load(path_state_dict) + ''' + + if os.path.isfile(path): + config = _parse_load_config(configs) + with open(path, 'rb') as f: + # When a value in the dict is larger than 4GB, there is a bug on macOS Python 3 + if sys.platform == 'darwin' and sys.version_info.major == 3: + load_result = _pickle_loads_mac(path, f) + else: + load_result = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + + # TODO(weixin): If `obj` is any object, the judgment condition should be more precise. + if isinstance(load_result, dict): + load_result = _pack_loaded_dict(load_result) + # paddle2.0: paddle.save/load + if "StructuredToParameterName@@" in load_result: + + for key in load_result["StructuredToParameterName@@"]: + load_result[key] = _ndarray_to_tensor( + load_result[key], config.return_numpy) + + if not config.keep_name_table and "StructuredToParameterName@@" in load_result: + del load_result["StructuredToParameterName@@"] + else: + # paddle2.1 static.save/load + for key in load_result: + load_result[key] = _ndarray_to_tensor( + load_result[key], config.return_numpy) + + else: + # TODO(weixin): support complex objects such as layer. + # If `obj` is any object, the judgment condition should be more precise. + if _transformed_from_lodtensor(load_result): + load_result = _ndarray_to_tensor(load_result, + config.return_numpy) + elif _transformed_from_varbase(load_result): + load_result = _tuple_to_tensor(load_result, + config.return_numpy) + else: + raise NotImplementedError( + 'Only tensor and state_dict types are supported, but received {}.'.
+ format(type(load_result))) + + else: + load_result = _legacy_load(path, **configs) + + return load_result + + +def _legacy_load(path, **configs): load_result = None config = _parse_load_config(configs) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 75dc62e530d0db..3a520615625324 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -112,10 +112,6 @@ def _conv_nd(x, # Due to the poor performance of NHWC, we transpose the input to NCHW. origin_format = data_format - if origin_format == "NHWC" and op_type == "depthwise_conv2d": - x = nn.transpose(x, perm=[0, 3, 1, 2]) - data_format = "NCHW" - channel_dim = 1 if in_dygraph_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', @@ -159,10 +155,6 @@ def _conv_nd(x, 'use_mkldnn': use_mkldnn}) else: out = pre_bias - - if origin_format == "NHWC" and op_type == "depthwise_conv2d": - out = nn.transpose(out, perm=[0, 2, 3, 1]) - return out diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index bf389717518ce2..b88a2b042ff481 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -148,9 +148,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): sparse(bool): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizers does not support sparse update, - such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , - :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , - :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + such as :ref:`api_paddle_optimizer_adadelta_Adadelta` , :ref:`api_paddle_optimizer_adamax_Adamax` , :ref:`api_paddle_optimizer_lamb_Lamb`. In these cases, sparse must be False. Default: False. padding_idx(int|long|None): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 1dad1632e264a6..6c8a2d1cbce850 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1388,6 +1388,8 @@ def cross_entropy(input, "should be '-100', but received %s, which is not allowed." 
% ignore_index) + softmax_switch = use_softmax + input_dims = len(list(input.shape)) label_dims = len(list(label.shape)) if input_dims - 1 != label_dims and input_dims != label_dims: @@ -1400,7 +1402,7 @@ def cross_entropy(input, _, out = core.ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + 'softmax_switch', softmax_switch) if weight is not None: @@ -1482,7 +1484,7 @@ def cross_entropy(input, 'ignore_index': ignore_index, 'numeric_stable_mode': True, 'axis': axis, - 'use_softmax': use_softmax + 'softmax_switch': softmax_switch } helper = LayerHelper('softmax_with_cross_entropy', **locals()) softmax = helper.create_variable_for_type_inference(dtype=input.dtype) diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 9e04095e7b7988..032d5b47eda077 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -119,6 +119,8 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): use_cudnn = True else: use_cudnn = False + if core.is_compiled_with_rocm(): + use_cudnn = False # the ROCM platform does not have a MIOPEN kernel for affine_grid if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ isinstance(out_shape, Variable)): diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 60c846f9f76ec0..86a6fae0d6857f 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1229,7 +1229,7 @@ class Embedding(layers.Layer): For specific usage, refer to code examples. It implements the function of the Embedding Layer. This layer is used to lookup embeddings vector of ids provided by :attr:`x` . It automatically constructs a 2D embedding matrix based on the - input :attr:`num_embeddings` and attr:`embedding_dim`. + input :attr:`num_embeddings` and :attr:`embedding_dim`. The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. @@ -1241,9 +1241,9 @@ class Embedding(layers.Layer): Case 1: - input is a Tensor. padding_idx = -1 - input.data = [[1, 3], [2, 4], [4, 127] - input.shape = [3, 2] + x is a Tensor. padding_idx = -1 + x.data = [[1, 3], [2, 4], [4, 127]] + x.shape = [3, 2] Given size = [128, 16] output is a Tensor: out.shape = [3, 2, 16] @@ -1261,7 +1261,7 @@ class Embedding(layers.Layer): Parameters: num_embeddings (int): Just one element which indicates the size of the dictionary of embeddings. - embedding_dim: Just one element which indicate the size of each embedding vector respectively. + embedding_dim (int): Just one element which indicates the size of each embedding vector respectively. padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup @@ -1270,9 +1270,7 @@ class Embedding(layers.Layer): sparse(bool): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster.
But some optimizers do not support sparse update, - such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , - :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , - :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + such as :ref:`api_paddle_optimizer_adadelta_Adadelta` , :ref:`api_paddle_optimizer_adamax_Adamax` , :ref:`api_paddle_optimizer_lamb_Lamb`. In these cases, sparse must be False. Default: False. weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition, diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 5830af3a182d4f..cdb87a1cb39207 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -33,7 +33,7 @@ class AvgPool1D(layers.Layer): - """ + r""" This operation applies a 1D average pooling over an input signal composed of several input planes, based on the input, output_size, return_mask parameters. Input(X) and output(Out) are in NCL format, where N is batch @@ -41,36 +41,33 @@ class AvgPool1D(layers.Layer): The output tensor shape will be [N, C, output_size]. The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as + output (N, C, :math:`L_{out}`) and kernel_size ksize can be precisely described as For average pool1d: .. math:: - Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k]) + Output(N_i, C_i, l) = \frac{\sum_{m=0}^{ksize-1} Input[N_i, C_i, stride \times l + m]}{ksize} - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain an integer. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain an integer. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, + it must contain an integer. Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - exclusive (bool): Whether to exclude padding points in average pooling - mode, default is `True`. - ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. - If it is set to False, the floor function will be used. The default value is False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default.
+ exclusive(bool, optional): Whether to exclude padding points in average pooling mode, default is `True`. + ceil_mode(bool, optional): ${ceil_mode_comment}Whether to use the ceil function to calculate output height + and width. If it is set to False, the floor function will be used. The default value is False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of AvgPool1D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". @@ -79,23 +76,24 @@ class AvgPool1D(layers.Layer): ShapeError: If the input is not a 3-D tensor. ShapeError: If the output's shape calculated is not greater than 0. - Shape: - - inpuut: 3-D tensor. - - output: 3-D tensor + - x(Tensor): The input tensor of avg pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of avg pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) - pool_out = AvgPool1D(data) - # pool_out shape: [1, 3, 16] + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) + pool_out = AvgPool1D(data) + # pool_out shape: [1, 3, 16] """ @@ -132,49 +130,53 @@ class AvgPool2D(layers.Layer): H is the height of the feature, and W is the width of the feature. Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) - $$ - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + Input: + X shape: :math:`(N, C, H_{in}, W_{in})` + Attr: + kernel_size: ksize + + Output: + Out shape: :math:`(N, C, H_{out}, W_{out})` + + .. math:: + + Output(N_i, C_j, h, w) = \frac{\sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} + Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)}{ksize[0] * ksize[1]} + + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise, the pool stride size will be a square of an int. - - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3.
A list[int] or tuple(int) whose length is 2, [pad_height, pad_width] whose value means the padding size of each dimension. 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - exclusive (bool): Whether to exclude padding points in average pooling - mode, default is `true`. - divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + ceil_mode(bool, optional): When True, will use `ceil` instead of `floor` to compute the output shape. + exclusive(bool, optional): Whether to exclude padding points in average pooling + mode, default is `True`. + divisor_override(float, optional): If specified, it will be used as divisor, otherwise kernel_size will be + used. Default None. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, + `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - - x: 4-D tensor. - - out: 2-D tensor + - x(Tensor): The input tensor of avg pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of avg pool2d operator, which is a 4-D tensor. + The data type is same as input x. - Returns: None. + Returns: + A callable object of AvgPool2D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. @@ -182,16 +184,16 @@ class AvgPool2D(layers.Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - AvgPool2D = nn.AvgPool2D(kernel_size=2, + # avg pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + AvgPool2D = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) - output = AvgPool2D(input) - # output.shape [1, 3, 16, 16] + output = AvgPool2D(input) + # output.shape [1, 3, 16, 16] """ @@ -238,61 +240,64 @@ class AvgPool3D(layers.Layer): in NCDHW format, where N is batch size, C is the number of channels, H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size
If pool kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain three integers, [stride_Depth, stride_Height, stride_Width). Otherwise, the pool stride size will be a cube of an int. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): ${ceil_mode_comment} - exclusive (bool): Whether to exclude padding points in average pooling - mode, default is True. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. - The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_depth, input_height, input_width]`. + ceil_mode(bool, optional): ${ceil_mode_comment} + exclusive(bool, optional): Whether to exclude padding points in average pooling mode, default is True. + divisor_override(int|float, optional): if specified, it will be used as divisor, otherwise kernel_size will + be used. Default None. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCDHW"`, + `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. - Returns: None. + Returns: + A callable object of AvgPool3D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. Shape: - - x: 5-D tensor. - - out: 5-D tensor. - + - x(Tensor): The input tensor of avg pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. + The data type is same as input x. Examples: .. 
code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # avg pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - AvgPool3D = nn.AvgPool3D(kernel_size=2, + # avg pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + AvgPool3D = nn.AvgPool3D(kernel_size=2, stride=2, padding=0) - output = AvgPool3D(input) - # output.shape [1, 2, 3, 16, 16] + output = AvgPool3D(input) + # output.shape [1, 2, 3, 16, 16] """ def __init__(self, kernel_size, - stride, + stride=None, padding=0, ceil_mode=False, exclusive=True, @@ -328,10 +333,11 @@ def extra_repr(self): class MaxPool1D(layers.Layer): """ - Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_mask parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. + This operation applies 1D max pooling over input signal + composed of several input planes based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCL format, where N is batch size, C is the number of channels, + L is the length of the feature. The output value of the layer with input size (N, C, L), output (N, C, L_{out}) and kernel_size k can be precisely described as .. math:: - Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} + Output(N_i, C_i, l) = max(Input[N_i, C_i, stride \times l:stride \times l+k]) - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain an integer. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain an integer. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, + it must contain an integer. Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An integer, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. - 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. - 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - return_mask (bool): Whether return the max indices along with the outputs. default is `False`. - ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default.
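Since `ceil_mode` only changes the rounding step in the output-length computation, a small reference sketch may help; this assumes the standard pooling arithmetic and is illustrative only (the helper name `pool_out_len` is ours, not Paddle's):

.. code-block:: python

    import math

    def pool_out_len(L, kernel_size, stride, padding=0, ceil_mode=False):
        # Standard pooling arithmetic: round((L + 2*pad - k) / stride) + 1,
        # where round is ceil when ceil_mode=True and floor otherwise.
        rounding = math.ceil if ceil_mode else math.floor
        return int(rounding((L + 2 * padding - kernel_size) / stride)) + 1

    # Matches the examples in these docstrings: L=32, k=2, s=2, p=0 -> 16.
    assert pool_out_len(32, 2, 2) == 16
    # ceil_mode only matters when (L + 2*pad - k) is not divisible by stride:
    assert pool_out_len(32, 3, 2, ceil_mode=False) == 15
    assert pool_out_len(32, 3, 2, ceil_mode=True) == 16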
- If it is set to False, the floor function will be used. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + return_mask(bool, optional): Whether return the max indices along with the outputs. default is `False`. + ceil_mode(bool, optional): Whether to use the ceil function to calculate output height and width. + False is the default. If it is set to False, the floor function will be used. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of MaxPool1D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". @@ -371,25 +376,27 @@ class MaxPool1D(layers.Layer): Shape: - - x: 3-D tensor. - - out: 3-D tensor. + - x(Tensor): The input tensor of max pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of max pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) - pool_out = MaxPool1D(data) - # pool_out shape: [1, 3, 16] + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) + pool_out = MaxPool1D(data) + # pool_out shape: [1, 3, 16] - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) - pool_out, indices = MaxPool1D(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) + pool_out, indices = MaxPool1D(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ @@ -426,70 +433,73 @@ class MaxPool2D(layers.Layer): H is the height of the feature, and W is the width of the feature. Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - $$ - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + - Input: + X shape: :math:`(N, C, H_{in}, W_{in})` + - Attr: + kernel_size: ksize + + - Output: + Out shape: :math:`(N, C, H_{out}, W_{out})` + + .. math:: + + Output(N_i, C_j, h, w) = \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} + Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise, the pool stride size will be a square of an int. - padding (string|int|list|tuple): The padding size. 
Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. - 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_mask (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + ceil_mode(bool, optional): when True, will use `ceil` instead of `floor` to compute the output shape + return_mask(bool, optional): Whether to return the max indices along with the outputs. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. - Returns: None + Returns: + A callable object of MaxPool2D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. Shape: - - x: 4-D tensor. - - out: 4-D tensor. + - x(Tensor): The input tensor of max pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of max pool2d operator, which is a 4-D tensor. + The data type is same as input x. Examples: ..
code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - MaxPool2D = nn.MaxPool2D(kernel_size=2, + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) - output = MaxPool2D(input) - # output.shape [1, 3, 16, 16] + output = MaxPool2D(input) + # output.shape [1, 3, 16, 16] - # for return_mask=True - MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) - output, max_indices = MaxPool2D(input) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + # for return_mask=True + MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) + output, max_indices = MaxPool2D(input) + # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ def __init__(self, @@ -532,59 +542,62 @@ class MaxPool3D(layers.Layer): in NCDHW format, where N is batch size, C is the number of channels, H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - Args: - kernel_size (int|list|tuple): The pool kernel size. If the kernel size + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain three integers, [stride_Depth, stride_Height, stride_Width). Otherwise, the pool stride size will be a cube of an int. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. - 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): ${ceil_mode_comment} - return_mask (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. - The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_depth, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`.
Usually name is no need to set and - None by default. + ceil_mode(bool, optional): ${ceil_mode_comment} + return_mask(bool, optional): Whether to return the max indices along with the outputs. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCDHW"`, + `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. - Returns:None. + Returns: + A callable object of MaxPool3D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. Shape: - - x: 5-D tensor. - - out: 5-D tensor. + - x(Tensor): The input tensor of max pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of max pool3d operator, which is a 5-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # max pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - MaxPool3D = nn.MaxPool3D(kernel_size=2, + # max pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0) - output = MaxPool3D(input) - # output.shape [1, 2, 3, 16, 16] + output = MaxPool3D(input) + # output.shape [1, 2, 3, 16, 16] - # for return_mask=True - MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) - output, max_indices = MaxPool3D(input) - # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], + # for return_mask=True + MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) + output, max_indices = MaxPool3D(input) + # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], """ def __init__(self, @@ -633,51 +646,52 @@ class AdaptiveAvgPool1D(layers.Layer): .. math:: - lstart &= floor(i * L_{in} / L_{out}) + lstart &= floor(i * L_{in} / L_{out}) - lend &= ceil((i + 1) * L_{in} / L_{out}) + lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} + Output(i) &= \frac{ \sum Input[lstart:lend]}{lend - lstart} - Args: - output_size (int): The target output size. It must be an integer. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + Parameters: + output_size(int): The target output size. It must be an integer. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of AdaptiveAvgPool1D. Raises: ValueError: 'output_size' should be an integer. Shape: - - x: 3-D tensor. - - out: 3-D tensor. + - x(Tensor): 3-D tensor. The input tensor of adaptive avg pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): 3-D tensor. The output tensor of adaptive avg pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. 
code-block:: python - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # - import paddle - import paddle.nn as nn - import numpy as np - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) - pool_out = AdaptiveAvgPool1D(data) - # pool_out shape: [1, 3, 16] + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart) + # + import paddle + import paddle.nn as nn + import numpy as np + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) + pool_out = AdaptiveAvgPool1D(data) + # pool_out shape: [1, 3, 16] """ def __init__(self, output_size, name=None): @@ -702,31 +716,32 @@ class AdaptiveAvgPool2D(layers.Layer): .. math:: - hstart &= floor(i * H_{in} / H_{out}) + hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) + wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + data_format(str, optional): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. 
+ - output(Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. + The data type is same as input x. Returns: A callable object of AdaptiveAvgPool2D. @@ -787,34 +802,36 @@ class AdaptiveAvgPool3D(layers.Layer): .. math:: - dstart &= floor(i * D_{in} / D_{out}) + dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) + dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) + hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) + hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) + wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + wend &= ceil((k + 1) * W_{in} / W_{out}) - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} + {(dend - dstart) * (hend - hstart) * (wend - wstart)} Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + data_format(str, optional): The data format of the input and output data. An optional string from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. + The data type is same as input x. Returns: A callable object of AdaptiveAvgPool3D. @@ -881,58 +898,59 @@ class AdaptiveMaxPool1D(layers.Layer): .. math:: - lstart &= floor(i * L_{in} / L_{out}) + lstart &= floor(i * L_{in} / L_{out}) - lend &= ceil((i + 1) * L_{in} / L_{out}) + lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= max(Input[lstart:lend]) + Output(i) &= max(Input[lstart:lend]) - Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - return_mask (bool): If true, the index of max pooling point will be returned along + Parameters: + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_mask(bool, optional): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default.
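The floor/ceil bin formulas above pin adaptive pooling down exactly, so a tiny pure-Python reference is easy to write. A hedged sketch for the 1-D average case (the helper name `adaptive_avg_pool1d_ref` is invented; this is not the operator's actual kernel):

.. code-block:: python

    import math

    def adaptive_avg_pool1d_ref(x, output_size):
        # x is a plain list of numbers standing in for one (N_i, C_j) row.
        L = len(x)
        out = []
        for i in range(output_size):
            lstart = math.floor(i * L / output_size)
            lend = math.ceil((i + 1) * L / output_size)
            out.append(sum(x[lstart:lend]) / (lend - lstart))
        return out

    # Each output bin averages the slice [lstart:lend); adjacent bins can
    # share an element when L is not a multiple of output_size.
    assert adaptive_avg_pool1d_ref([1.0, 2.0, 3.0, 4.0], 2) == [1.5, 3.5]

The max variants use the same bins and simply take `max` over the slice instead of the average.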
+ name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of AdaptiveMaxPool1D. Raises: ValueError: 'pool_size' should be a integer or list or tuple with length as 1. Shape: - x (Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle - import paddle.nn as nn - import numpy as np - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) - pool_out = AdaptiveMaxPool1D(data) - # pool_out shape: [1, 3, 16] - - # for return_mask = true - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) - pool_out, indices = AdaptiveMaxPool1D(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn as nn + import numpy as np + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) + pool_out = AdaptiveMaxPool1D(data) + # pool_out shape: [1, 3, 16] + + # for return_mask = true + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) + pool_out, indices = AdaptiveMaxPool1D(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ @@ -954,31 +972,36 @@ def extra_repr(self): class AdaptiveMaxPool2D(layers.Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and + pooling is adaptive one focus on the output size. For adaptive max pool2d: .. 
math:: - hstart &= floor(i * H_{in} / H_{out}) + hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) + wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) - Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) + Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - return_mask (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain + two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of + the input. + return_mask(bool, optional): If true, the index of max pooling point will be returned along with outputs. + It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. + The data type is same as input x. Returns: A callable object of AdaptiveMaxPool2D. @@ -1029,36 +1052,42 @@ def extra_repr(self): class AdaptiveMaxPool3D(layers.Layer): """ - This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are + determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus + on the output size. For adaptive max pool3d: .. math:: - dstart &= floor(i * D_{in} / D_{out}) + dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) + dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) + hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) + hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) + wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + wend &= ceil((k + 1) * W_{in} / W_{out}) - Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) + Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). 
D, H and W can be either a int, or None which means the size will be the same as that of the input. - return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain + three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as + that of the input. + return_mask(bool, optional): If true, the index of max pooling point will be returned along with outputs. + Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. + The data type is same as input x. + Returns: A callable object of AdaptiveMaxPool3D. Examples: diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py old mode 100644 new mode 100755 index 0806d2c29148f7..f917b4fa09a507 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -106,6 +106,7 @@ def fc(x, weight_attr (ParamAttr, optional): The attribute for the learnable weight. The default value is None, and the weight will be initialized to zero. For detailed information, please refer to :attr:`paddle.ParamAttr`. + Warning, if x is a list of tensor, weight_attr should also be a list of same length. bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. If it is set to False, no bias will be added to the output. 
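To make the `weight_attr` warning above concrete, here is a minimal hedged sketch of calling fc with two input tensors and a matching list of weight attributes (the shapes and the parameter names "w1"/"w2" are invented for illustration):

.. code-block:: python

    import paddle

    paddle.enable_static()

    # Two inputs, so weight_attr must be a list of the same length:
    # one ParamAttr per input tensor, as the warning above requires.
    x1 = paddle.static.data(name="x1", shape=[None, 8], dtype="float32")
    x2 = paddle.static.data(name="x2", shape=[None, 4], dtype="float32")
    out = paddle.static.nn.fc(
        x=[x1, x2],
        size=16,
        weight_attr=[paddle.ParamAttr(name="w1"), paddle.ParamAttr(name="w2")],
    )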
If it is set to None or one kind of ParamAttr, a bias parameter will diff --git a/python/requirements.txt b/python/requirements.txt index e89b3ede94fd4a..609a4b34e8f1ae 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -7,5 +7,5 @@ gast>=0.3.3 ; platform_system != "Windows" gast==0.3.3 ; platform_system == "Windows" Pillow six -decorator +decorator==4.4.2 astor diff --git a/python/setup.py.in b/python/setup.py.in index 73c773bab494d0..2883f2ed248677 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -149,6 +149,7 @@ packages=['paddle', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.meta_optimizers', 'paddle.distributed.fleet.meta_optimizers.sharding', + 'paddle.distributed.fleet.meta_optimizers.ascend', 'paddle.distributed.fleet.runtime', 'paddle.distributed.fleet.dataset', 'paddle.distributed.fleet.data_generator', @@ -216,6 +217,7 @@ packages=['paddle', 'paddle.static.amp', 'paddle.tensor', 'paddle.onnx', + 'paddle.autograd', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: diff --git a/scripts/paddle b/scripts/paddle new file mode 100644 index 00000000000000..5f256ccf157910 --- /dev/null +++ b/scripts/paddle @@ -0,0 +1,169 @@ +#!/bin/bash + +function version(){ + echo "PaddlePaddle , compiled with" + echo " with_avx: ON" + echo " with_gpu: OFF" + echo " with_mkl: ON" + echo " with_mkldnn: " + echo " with_python: ON" +} + +function ver2num() { + set -e + # convert version to number. + if [ -z "$1" ]; then # empty argument + printf "%03d%03d%03d%03d%03d" 0 + else + local VERN=$(echo $1 | sed 's#v##g' | sed 's#\.# #g' \ + | sed 's#a# 0 #g' | sed 's#b# 1 #g' | sed 's#rc# 2 #g') + if [ `echo $VERN | wc -w` -eq 3 ] ; then + printf "%03d%03d%03d%03d%03d" $VERN 999 999 + else + printf "%03d%03d%03d%03d%03d" $VERN + fi + fi + set +e +} + +function cpu_config() { + # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status + # only when MKL enabled + if [ "ON" == "OFF" ]; then + return 0 + fi + platform="`uname -s`" + ht=0 + if [ $platform == "Linux" ]; then + ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + elif [ $platform == "Darwin" ]; then + if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then + # HT is OFF + ht=1 + fi + else + return 0 + fi + if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi + else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="True" + fi + fi +} + +function threads_config() { + # auto set OMP_NUM_THREADS and MKL_NUM_THREADS + # according to trainer_count and total processors + # only when MKL enabled + # auto set OPENBLAS_NUM_THREADS when do not use MKL + platform="`uname -s`" + processors=0 + if [ $platform == "Linux" ]; then + processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` + elif [ $platform == "Darwin" ]; then + processors=`sysctl -n hw.logicalcpu` + else + return 0 + fi + trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` + if [ -z $trainers ]; then + trainers=1 + fi + threads=$((processors / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + if [ "ON" == "ON" ]; then + if [ -z "$OMP_NUM_THREADS" ]; then + export OMP_NUM_THREADS=$threads + fi + if [ -z "$MKL_NUM_THREADS" ]; then + export MKL_NUM_THREADS=$threads + fi + else + if [ -z "$OPENBLAS_NUM_THREADS" ]; then + export 
OPENBLAS_NUM_THREADS=$threads + fi + if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then + export OPENBLAS_MAIN_FREE=1 + fi + fi + +} + +PADDLE_CONF_HOME="$HOME/.config/paddle" +mkdir -p ${PADDLE_CONF_HOME} + +if [ -z "${PADDLE_NO_STAT+x}" ]; then + SERVER_VER=`curl -m 5 -X POST --data content="{ \"version\": \"\" }"\ + -b ${PADDLE_CONF_HOME}/paddle.cookie \ + -c ${PADDLE_CONF_HOME}/paddle.cookie \ + http://api.paddlepaddle.org/version 2>/dev/null` + if [ $? -eq 0 ] && [ "$(ver2num )" -lt $(ver2num $SERVER_VER) ]; then + echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org" + fi +fi + +PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [ ! -z "${DEBUGGER}" ]; then + echo "Using debug command ${DEBUGGER}" +fi + +CUDNN_LIB_PATH="" + +if [ ! -z "${CUDNN_LIB_PATH}" ]; then + export LD_LIBRARY_PATH=${CUDNN_LIB_PATH}:${LD_LIBRARY_PATH} +fi + +export PYTHONPATH=${PWD}:${PYTHONPATH} + + +# Check python lib installed or not. +pip --help > /dev/null +if [ $? -ne 0 ]; then + echo "pip should be installed to run paddle." + exit 1 +fi + +if [ "OFF" == "ON" ]; then + PADDLE_NAME="paddlepaddle-gpu" +else + PADDLE_NAME="paddlepaddle" +fi + +INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'` + +if [ -z "${INSTALLED_VERSION}" ]; then + INSTALLED_VERSION="0.0.0" # not installed +fi +cat <prebuild.log 2>&1 -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' > /$PADDLE_ROOT/br-ut +if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then + bash $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh cmake_gen_in_current_dir >prebuild.log 2>&1 +elif [[ "$SYSTEM" == "Windows_NT" ]];then + bash $PADDLE_ROOT/win_cmake.sh +fi +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut cd $PADDLE_ROOT/build -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' > /$PADDLE_ROOT/pr-ut +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut cd $PADDLE_ROOT +echo "=================================" +echo "br-ut" +cat $PADDLE_ROOT/br-ut +echo "=================================" +echo "pr-ut" +cat $PADDLE_ROOT/pr-ut +echo "=================================" grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut -sort pr-ut |uniq -d > $PADDLE_ROOT/duplicate_ut +if [[ "$SYSTEM" == 'Linux' ]];then + sort pr-ut |uniq -d > $PADDLE_ROOT/duplicate_ut +fi echo "New-UT:" cat $PADDLE_ROOT/added_ut rm -rf prec_build -rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh -git checkout $CURBRANCH +if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then + rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh +elif [[ "$SYSTEM" == "Windows_NT" ]];then + rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/get_added_ut.sh +fi +git checkout -f $CURBRANCH echo $CURBRANCH git branch -D prec_added_ut cd $CURDIR diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 4e8ea25715451f..eb05468eda6cad 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + if [ -z ${BRANCH} ]; then BRANCH="develop" fi diff --git a/tools/check_sequence_op.sh b/tools/check_sequence_op.sh index ada96750eaad80..a263b046b258b1 100644 --- a/tools/check_sequence_op.sh +++ b/tools/check_sequence_op.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" function check_sequnece_op_unitests(){ diff --git a/tools/cudaError/start.sh b/tools/cudaError/start.sh index 3c0e57ffe7ec1f..66e56b8485d8c6 100644 --- a/tools/cudaError/start.sh +++ b/tools/cudaError/start.sh @@ -1,4 +1,19 @@ #!/usr/bin/env bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -ex SYSTEM=`uname -s` rm -f protoc-3.11.3-linux-x86_64.* diff --git a/tools/diff_api.py b/tools/diff_api.py index 8a2acbb3d0acc7..f086598945afe4 100644 --- a/tools/diff_api.py +++ b/tools/diff_api.py @@ -1,4 +1,19 @@ #!/usr/bin/env python + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from __future__ import print_function import difflib import sys diff --git a/tools/diff_unittest.py b/tools/diff_unittest.py index 382fbdd0b0c29f..fa70be0990ec09 100644 --- a/tools/diff_unittest.py +++ b/tools/diff_unittest.py @@ -1,4 +1,19 @@ #!/usr/bin/env python + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import difflib import sys diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index d68992717c5126..f566e66a97662e 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -97,8 +97,8 @@ RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e44 WORKDIR /home/setuptools-40.6.2 RUN python setup.py build && python setup.py install WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz && tar -zxvf pip-18.0.tar.gz -WORKDIR pip-18.0 +RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz +WORKDIR pip-20.0.1 RUN python setup.py install && \ python3.8 setup.py install && \ python3.7 setup.py install && \ @@ -106,8 +106,8 @@ RUN python setup.py install && \ python3 setup.py install WORKDIR /home -RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \ - rm -r Python-$version setuptools-40.6.2 pip-18.0 +RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-20.0.1.tar.gz && \ + rm -r Python-$version setuptools-40.6.2 pip-20.0.1 # Install Go and glide RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ diff --git a/tools/dockerfile/build_scripts/install_gcc.sh b/tools/dockerfile/build_scripts/install_gcc.sh index e75021b2a9b653..e744e9ddac66e6 100644 --- a/tools/dockerfile/build_scripts/install_gcc.sh +++ b/tools/dockerfile/build_scripts/install_gcc.sh @@ -43,4 +43,18 @@ if [ "$1" == "gcc82" ]; then ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \ ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \ cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path} +elif [ "$1" == "gcc54" ]; then + wget -q http://ftp.tsukuba.wide.ad.jp/software/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 + tar -xvf gcc-5.4.0.tar.bz2 && \ + cd gcc-5.4.0 && \ + unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ + ./contrib/download_prerequisites && \ + cd .. && mkdir temp_gcc54 && cd temp_gcc54 && \ + ../gcc-5.4.0/configure --prefix=/usr/local/gcc-5.4 --enable-checking=release --enable-languages=c,c++ --disable-multilib && \ + make -j8 && make install + cd .. 
&& rm -rf temp_gcc54 + cp ${lib_so_6} ${lib_so_6}.bak && rm -f ${lib_so_6} && + ln -s /usr/local/gcc-5.4/lib64/libgfortran.so.5 ${lib_so_5} && \ + ln -s /usr/local/gcc-5.4/lib64/libstdc++.so.6 ${lib_so_6} && \ + cp /usr/local/gcc-5.4/lib64/libstdc++.so.6.0.21 ${lib_path} fi diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 490bff22826826..5f8a48c8067a5b 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -20,11 +20,15 @@ REPO="${REPO:-paddledocker}" function make_cuda9cudnn7(){ sed 's//9.0-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + } function make_cuda10cudnn7() { sed 's//10.0-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + } diff --git a/tools/dockerfile/icode.sh b/tools/dockerfile/icode.sh index da3ffb8c77db71..973975fe7f7373 100755 --- a/tools/dockerfile/icode.sh +++ b/tools/dockerfile/icode.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
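The dockerfile helpers below, like the centos7_manylinux.sh changes above, all follow one pattern: take a Dockerfile template, swap a placeholder line for concrete RUN/ENV instructions with sed, and write Dockerfile.tmp. A rough Python analogue of that substitution step, purely illustrative (the `<install_gcc>` marker is hypothetical; the real scripts match literal Dockerfile text with sed):

.. code-block:: python

    def render_dockerfile(template_text, gcc_flavor="gcc82", gcc_prefix="/usr/local/gcc-8.2"):
        # Swap a placeholder for the concrete install commands, mirroring
        # what the sed calls in these scripts do when producing Dockerfile.tmp.
        install = (
            "RUN bash build_scripts/install_gcc.sh {flavor}\n"
            "ENV PATH={prefix}/bin:$PATH"
        ).format(flavor=gcc_flavor, prefix=gcc_prefix)
        return template_text.replace("<install_gcc>", install)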
+ function install_gcc(){ sed -i 's##RUN apt-get update \ diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh index b7d0d8e3e2aac7..23578b4143f8b1 100755 --- a/tools/dockerfile/ubuntu16_dev.sh +++ b/tools/dockerfile/ubuntu16_dev.sh @@ -33,7 +33,7 @@ function ref_whl(){ fi if [[ ${WITH_GPU} != "ON" ]]; then - ref_gcc = "" + ref_gcc="" elif [[ ${gcc_version} == "8.2.0" ]];then ref_gcc=_gcc8.2 fi @@ -44,29 +44,31 @@ function ref_whl(){ ref_version=.post100 elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then ref_version=.post101 - elif [[ ${ref_CUDA_MAJOR} == "10.2" ]];then + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} == "develop" ]];then + ref_version=.post102 + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} != "develop" ]];then ref_version="" elif [[ ${ref_CUDA_MAJOR} == "9" ]];then ref_version=.post90 fi + + ref_dev=2.1.0.dev0 ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" - if [[ ${PADDLE_VERSION} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - else - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - fi - - if [[ ${PADDLE_VERSION} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then + if [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} != "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} != "develop" && ${WITH_GPU} == "ON" ]]; then ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp36-cp36m-linux_x86_64.whl @@ -106,7 +108,7 @@ function install_gcc(){ else sed -i 's##RUN 
apt-get update \ WORKDIR /usr/bin \ - RUN apt install -y gcc-4.8 g++-4.8 \&\& cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \&\& ln -s gcc-4.8 gcc \&\& ln -s g++-4.8 g++ #g' Dockerfile.tmp + RUN apt install -y gcc g++ #g' Dockerfile.tmp fi } diff --git a/tools/dockerfile/ubuntu18_dev.sh b/tools/dockerfile/ubuntu18_dev.sh index 19572f639bcf5f..6c6a14529ca0e1 100755 --- a/tools/dockerfile/ubuntu18_dev.sh +++ b/tools/dockerfile/ubuntu18_dev.sh @@ -33,7 +33,7 @@ function ref_whl(){ fi if [[ ${WITH_GPU} != "ON" ]]; then - ref_gcc = "" + ref_gcc="" elif [[ ${gcc_version} == "8.2.0" ]];then ref_gcc=_gcc8.2 fi @@ -44,29 +44,31 @@ function ref_whl(){ ref_version=.post100 elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then ref_version=.post101 - elif [[ ${ref_CUDA_MAJOR} == "10.2" ]];then + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} == "develop" ]];then + ref_version=.post102 + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} != "develop" ]];then ref_version="" elif [[ ${ref_CUDA_MAJOR} == "9" ]];then ref_version=.post90 fi + + ref_dev=2.1.0.dev0 ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" - if [[ ${PADDLE_VERSION} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - else - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - fi - - if [[ ${PADDLE_VERSION} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then + if [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} != "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} != "develop" && ${WITH_GPU} == "ON" ]]; then ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp27-cp27mu-linux_x86_64.whl 
     ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp35-cp35m-linux_x86_64.whl
     ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp36-cp36m-linux_x86_64.whl
@@ -107,7 +109,7 @@ function install_gcc(){
   else
     sed -i 's##RUN apt-get update \
	WORKDIR /usr/bin \
-	RUN apt install -y gcc-4.8 g++-4.8 \&\& cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \&\& ln -s gcc-4.8 gcc \&\& ln -s g++-4.8 g++ #g' Dockerfile.tmp
+	RUN apt install -y gcc g++ #g' Dockerfile.tmp
   fi
 }
diff --git a/tools/document_preview.sh b/tools/document_preview.sh
index 10f486f8fd4f63..83c758d0aa8b8f 100755
--- a/tools/document_preview.sh
+++ b/tools/document_preview.sh
@@ -1,4 +1,19 @@
 #!/bin/bash
+
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 PADDLE_ROOT=/home
 mkdir ${PADDLE_ROOT}
 cd ${PADDLE_ROOT}
diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh
index 81eb19dc0661e6..bce338a8619e64 100755
--- a/tools/get_cpu_info.sh
+++ b/tools/get_cpu_info.sh
@@ -1,5 +1,19 @@
 #!/bin/bash
 
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 if [ "`uname -s`" != "Linux" ]; then
   echo "Current scenario only support in Linux yet!"
   exit 0
diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py
index 3fb78b0d0a19ac..0f745f212078fc 100644
--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -16,166 +16,148 @@
 import os
 
 # *=======These unittest doesn't occupy GPU memory, just run as CPU unittest=======* #
-# It run 8 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
+# It runs 16 jobs at a time. If a test fails due to insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
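+# main() below folds every list into a single anchored regex of the form
+# '^job$|^test_a$|^test_b$' for the ctest-based Windows runner, so each
+# entry must match a ctest target name exactly.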
 CPU_PARALLEL_JOB = [
-    'test_row_conv',
-    'test_nce',
-    'test_conv3d_mkldnn_op',
-    'dim_test',
-    'test_limit_gpu_memory',
-    'profiler_test',
-    'test_dequantize_mkldnn_op',
-    'test_elementwise_add_bf16_mkldnn_op',
-    'test_rpn_target_assign_op',
-    'test_hash_op',
-    'reader_blocking_queue_test',
-    'jit_kernel_test',
-    'test_tdm_child_op',
-    'test_simplify_with_basic_ops_pass',
-    'test_sequence_last_step',
-    'test_sequence_first_step',
-    'test_seq_concat_fc_fuse_pass',
-    'test_fc_gru_fuse_pass',
-    'test_dataset_imdb',
-    'dlpack_tensor_test',
-    'check_reduce_rank_test',
+    'test_static_save_load_large',
+    'version_test',
     'var_type_traits_test',
     'var_type_inference_test',
+    'variable_test',
+    'unroll_array_ops_test',
+    'tuple_test',
     'to_string_test',
+    'timer_test',
     'threadpool_test',
+    'test_zeros_op',
+    'test_while_op',
+    'test_weight_quantization_mobilenetv1',
     'test_version',
     'test_var_info',
     'test_var_conv_2d',
+    'test_utils',
     'test_unique_name',
     'test_transpose_int8_mkldnn_op',
     'test_transpose_bf16_mkldnn_op',
+    'test_trainer_desc',
     'test_trainable',
     'test_teacher_student_sigmoid_loss_op',
     'test_tdm_sampler_op',
+    'test_tdm_child_op',
+    'test_sysconfig',
+    'test_sync_batch_norm_pass',
     'test_switch',
     'test_static_shape_inferrence_for_shape_tensor',
-    'test_squared_mat_sub_fuse_pass',
-    'test_sequence_scatter_op',
-    'test_sequence_scatter_op',
-    'test_scaled_dot_product_attention',
-    'test_rnn_memory_helper_op',
-    'test_requantize_mkldnn_op',
-    'test_quantize_transpiler',
-    'test_quantize_mkldnn_op',
-    'test_py_reader_sample_generator',
-    'test_parallel_executor_seresnext_with_reduce_cpu',
-    'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu',
-    'test_parallel_executor_seresnext_base_cpu',
-    'test_parallel_dygraph_sync_batch_norm',
-    'test_origin_info',
-    'test_multiclass_nms_op',
-    'test_mkldnn_conv_bias_fuse_pass',
-    'test_mkldnn_conv_activation_fuse_pass',
-    'test_matrix_nms_op',
-    'test_ir_graph',
-    'test_inference_api',
-    'test_infer_shape',
-    'test_infer_no_need_buffer_slots',
-    'test_imperative_numpy_bridge',
-    'test_imperative_decorator',
-    'test_hooks',
-    'test_gpu_package_without_gpu_device',
-    'test_global_var_getter_setter',
-    'test_get_set_flags',
-    'test_fusion_repeated_fc_relu_op',
-    'test_fused_emb_seq_pool_op',
-    'test_fleet_base_4',
-    'test_fc_lstm_fuse_pass',
-    'test_executor_feed_non_tensor',
-    'test_executor_check_feed',
-    'test_executor_and_use_program_cache',
-    'test_exception',
-    'test_error_clip',
-    'test_embedding_eltwise_layernorm_fuse_pass',
-    'test_dyn_rnn',
-    'test_dpsgd_op',
-    'test_distributed_reader',
-    'test_directory_migration',
-    'test_dataset_wmt',
-    'test_dataset_uci_housing',
-    'test_dataset_cifar',
-    'test_data_feeder',
-    'test_cudnn_placement_pass',
-    'test_conv3d_layer',
-    'test_concat_bf16_mkldnn_op',
-    'test_common_infer_shape_functions',
-    'test_check_import_scipy',
-    'test_calc_gradient',
-    'test_bipartite_match_op',
-    'test_attention_lstm_op',
-    'test_array_read_write_op',
-    'stringprintf_test',
-    'stringpiece_test',
-    'selected_rows_test',
-    'scope_test',
-    'reader_test',
-    'prune_test',
-    'op_tester',
-    'eigen_test',
-    'device_worker_test',
-    'cudnn_helper_test',
-    'cudnn_desc_test',
-    'tuple_test',
-    'timer_test',
-    'test_zeros_op',
-    'test_while_op',
-    'test_utils',
     'test_static_analysis',
+    'test_squared_mat_sub_fuse_pass',
     'test_split_and_merge_lod_tensor_op',
     'test_spawn_and_init_parallel_env',
     'test_slice_var',
+    'test_skip_layernorm_fuse_pass',
+    'test_simplify_with_basic_ops_pass',
     'test_similarity_focus_op',
     'test_shuffle_batch_op',
     'test_shrink_rnn_memory',
     'test_set_bool_attr',
     'test_sequence_topk_avg_pooling',
+    'test_sequence_scatter_op',
+    'test_sequence_scatter_op',
+    'test_sequence_last_step',
+    'test_sequence_first_step',
+    'test_seqpool_cvm_concat_fuse_pass',
+    'test_seqpool_concat_fuse_pass',
+    'test_seq_concat_fc_fuse_pass',
     'test_selected_rows',
     'test_scope',
+    'test_scale_matmul_fuse_pass',
+    'test_scaled_dot_product_attention',
     'test_sampling_id_op',
     'test_runtime_and_compiletime_exception',
     'test_run_fluid_by_module_or_command_line',
+    'test_rpn_target_assign_op',
+    'test_row_conv',
+    'test_rnn_memory_helper_op',
     'test_retinanet_detection_output',
+    'test_reshape_transpose_matmul_mkldnn_fuse_pass',
+    'test_reshape_bf16_op',
     'test_require_version',
+    'test_requantize_mkldnn_op',
+    'test_repeated_fc_relu_fuse_pass',
     'test_repeated_fc_relu_fuse_pass',
     'test_registry',
+    'test_reducescatter_api',
+    'test_reducescatter',
     'test_recurrent_op',
     'test_recommender_system',
     'test_query_op',
+    'test_quantize_transpiler',
+    'test_quantize_mkldnn_op',
     'test_quantization_mkldnn_pass',
+    'test_quant_int8_resnet50_mkldnn',
+    'test_quant_int8_mobilenetv2_mkldnn',
+    'test_quant_int8_mobilenetv1_mkldnn',
+    'test_quant_int8_googlenet_mkldnn',
+    'test_quant2_int8_resnet50_range_mkldnn',
+    'test_quant2_int8_resnet50_mkldnn',
+    'test_quant2_int8_resnet50_channelwise_mkldnn',
+    'test_quant2_int8_mobilenetv1_mkldnn',
     'test_quant2_int8_mkldnn_pass',
-    'test_pybind_interface',
+    'test_quant2_int8_ernie_mkldnn',
+    'test_py_reader_sample_generator',
+    'test_py_reader_return_list',
+    'test_py_reader_lod_level_share',
     'test_py_reader_error_msg',
+    'test_pyramid_hash_op',
+    'test_pybind_interface',
+    'test_ps_dispatcher',
     'test_prune',
+    'test_protobuf_descs',
     'test_protobuf',
     'test_progressbar',
     'test_program_to_string',
     'test_program_code',
     'test_program',
     'test_precision_recall_op',
+    'test_post_training_quantization_resnet50',
+    'test_post_training_quantization_mobilenetv1',
+    'test_post_training_quantization_mnist',
     'test_positive_negative_pair_op',
-    'test_parallel_executor_run_load_infer_program',
+    'test_paddle_inference_api',
+    'test_origin_info',
     'test_op_version',
     'test_op_support_gpu',
+    'test_operator_desc',
+    'test_operator',
     'test_ones_op',
     'test_npair_loss_op',
     'test_nn_functional_embedding_static',
+    'test_nce',
     'test_name_scope',
+    'test_naive_executor',
     'test_multiprocess_dataloader_iterable_dataset_split',
+    'test_multiprocess_dataloader_exception',
+    'test_multihead_matmul_fuse_pass',
+    'test_multi_gru_seq_fuse_pass',
     'test_multi_gru_mkldnn_op',
+    'test_multi_gru_fuse_pass',
+    'test_multiclass_nms_op',
     'test_mul_int8_mkldnn_op',
     'test_mkldnn_scale_matmul_fuse_pass',
+    'test_mkldnn_placement_pass',
+    'test_mkldnn_op_nhwc',
     'test_mkldnn_op_inplace',
     'test_mkldnn_matmul_transpose_reshape_fuse_pass',
+    'test_mkldnn_matmul_op_output_fuse_pass',
+    'test_mkldnn_inplace_pass',
     'test_mkldnn_inplace_fuse_pass',
     'test_mkldnn_cpu_bfloat16_pass',
+    'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass',
+    'test_mkldnn_conv_bias_fuse_pass',
+    'test_mkldnn_conv_activation_fuse_pass',
     'test_mine_hard_examples_op',
     'test_memory_usage',
+    'test_matrix_nms_op',
+    'test_matmul_transpose_reshape_fuse_pass',
     'test_matmul_mkldnn_op',
     'test_matmul_bf16_mkldnn_op',
     'test_math_op_patch',
@@ -186,53 +168,100 @@
     'test_lod_tensor_array_ops',
     'test_lod_tensor_array',
     'test_lod_rank_table',
-    'test_lod_array_length_op',
     'test_locality_aware_nms_op',
     'test_load_vars_shape_check',
     'test_load_op_xpu',
     'test_load_op',
-    'test_linear_chain_crf_op',
+    'test_limit_gpu_memory',
     'test_layer_norm_mkldnn_op',
     'test_layer_norm_bf16_mkldnn_op',
+    'test_layer',
     'test_lambv2_op',
+    'test_is_test_pass',
     'test_ir_skip_layernorm_pass',
+    'test_ir_graph',
     'test_io_save_load',
     'test_input_spec',
+    'test_infer_shape',
+    'test_infer_no_need_buffer_slots',
     'test_inference_model_io',
+    'test_inference_api',
+    'test_imperative_signal_handler',
+    'test_imperative_numpy_bridge',
+    'test_imperative_group',
+    'test_imperative_decorator',
+    'test_imperative_data_loader_process',
+    'test_imperative_data_loader_exit_func',
     'test_imperative_base',
     'test_image_classification_layer',
     'test_image',
     'test_ifelse_basic',
     'test_hsigmoid_op',
+    'test_hooks',
+    'test_hash_op',
+    'test_group',
+    'test_graph_pattern_detector',
+    'test_gpu_package_without_gpu_device',
+    'test_global_var_getter_setter',
+    'test_get_set_flags',
     'test_generator',
     'test_generate_proposal_labels_op',
     'test_generate_mask_labels_op',
     'test_gast_with_compatibility',
     'test_fusion_squared_mat_sub_op',
+    'test_fusion_seqpool_cvm_concat_op',
+    'test_fusion_seqpool_concat_op',
+    'test_fusion_seqexpand_concat_fc_op',
     'test_fusion_seqconv_eltadd_relu_op',
+    'test_fusion_repeated_fc_relu_op',
     'test_fusion_lstm_op',
     'test_fusion_gru_op',
+    'test_fusion_gru_mkldnn_op',
     'test_fusion_gru_int8_mkldnn_op',
     'test_fusion_gru_bf16_mkldnn_op',
+    'test_fused_emb_seq_pool_op',
     'test_fused_embedding_fc_lstm_op',
     'test_function_spec',
     'test_full_op',
+    'test_fs_interface',
+    'test_fs',
     'test_framework_debug_str',
     'test_fp16_utils',
+    'test_fleet_util',
+    'test_fleet_unitaccessor',
+    'test_fleet_runtime',
+    'test_fleet_rolemaker_init',
     'test_bf16_utils',
     'test_fleet_rolemaker_4',
+    'test_fleet_rolemaker_3',
+    'test_fleet_rolemaker',
+    'test_fleet_nocvm_1',
+    'test_fleet_base_4',
+    'test_fleet',
+    'test_fleet',
     'test_flags_use_mkldnn',
+    'test_flags_mkldnn_ops_on_off',
     'test_filter_by_instag_op',
     'test_fetch_var',
     'test_fetch_handler',
     'test_feed_fetch_method',
     'test_fc_mkldnn_op',
     'test_fc_lstm_fuse_pass',
+    'test_fc_lstm_fuse_pass',
     'test_fc_gru_fuse_pass',
+    'test_fc_gru_fuse_pass',
+    'test_fc_elementwise_layernorm_fuse_pass',
     'test_fc_bf16_mkldnn_op',
-    'test_entry_attr',
+    'test_executor_feed_non_tensor',
+    'test_executor_check_feed',
+    'test_executor_and_use_program_cache',
+    'test_exception',
+    'test_error_clip',
     'test_entry_attr2',
+    'test_entry_attr',
+    'test_embedding_eltwise_layernorm_fuse_pass',
     'test_elementwise_mul_bf16_mkldnn_op',
+    'test_elementwise_add_bf16_mkldnn_op',
     'test_eager_deletion_recurrent_op',
     'test_eager_deletion_padding_rnn',
     'test_eager_deletion_mnist',
@@ -240,203 +269,658 @@
     'test_eager_deletion_conditional_block',
     'test_dynrnn_static_input',
     'test_dynrnn_gradient_check',
+    'test_dyn_rnn',
     'test_dygraph_mode_of_unittest',
+    'test_dpsgd_op',
+    'test_downpoursgd',
     'test_download',
     'test_distributions',
+    'test_distributed_reader',
+    'test_directory_migration',
     'test_detection_map_op',
     'test_desc_clone',
+    'test_dequantize_mkldnn_op',
     'test_depthwise_conv_mkldnn_pass',
     'test_deprecated_memory_optimize_interfaces',
     'test_default_scope_funcs',
     'test_default_dtype',
+    'test_debugger',
+    'test_dataset_wmt',
     'test_dataset_voc',
+    'test_dataset_uci_housing',
     'test_dataset_movielens',
     'test_dataset_imikolov',
+    'test_dataset_imdb',
     'test_dataset_conll05',
+    'test_dataset_cifar',
+    'test_dataloader_unkeep_order',
+    'test_dataloader_keep_order',
+    'test_dataloader_dataset',
     'test_data_generator',
+    'test_data_feeder',
     'test_data',
     'test_cyclic_cifar_dataset',
+    'test_cudnn_placement_pass',
     'test_crypto',
+    'test_crf_decoding_op',
+    'test_create_parameter',
     'test_create_op_doc_string',
     'test_create_global_var',
+    'test_cpu_quantize_squash_pass',
+    'test_cpu_quantize_placement_pass',
+    'test_cpu_quantize_pass',
+    'test_cpu_bfloat16_placement_pass',
+    'test_cpu_bfloat16_pass',
+    'test_conv_elementwise_add_mkldnn_fuse_pass',
+    'test_conv_concat_relu_mkldnn_fuse_pass',
+    'test_conv_bias_mkldnn_fuse_pass',
+    'test_conv_batch_norm_mkldnn_fuse_pass',
+    'test_conv_activation_mkldnn_fuse_pass',
     'test_conv3d_transpose_layer',
+    'test_conv3d_mkldnn_op',
+    'test_conv3d_layer',
     'test_conv2d_transpose_layer',
     'test_conv2d_mkldnn_op',
     'test_conv2d_layer',
     'test_conv2d_int8_mkldnn_op',
     'test_conv2d_bf16_mkldnn_op',
+    'test_context_manager',
     'test_const_value',
     'test_conditional_block',
     'test_concat_int8_mkldnn_op',
+    'test_concat_bf16_mkldnn_op',
     'test_compat',
-    'test_collective_base',
-    'test_collective_api_base',
+    'test_common_infer_shape_functions',
     'test_chunk_eval_op',
+    'test_check_import_scipy',
+    'test_c_comm_init_all_op',
+    'test_calc_gradient',
     'test_broadcast_to_op',
     'test_broadcast_shape',
     'test_broadcast_error',
+    'test_broadcast',
     'test_bpr_loss_op',
+    'test_boxps',
+    'test_bipartite_match_op',
+    'test_benchmark',
     'test_beam_search_op',
     'test_batch_sampler',
+    'test_batch_norm_act_fuse_pass',
     'test_basic_rnn_name',
+    'test_attention_lstm_op',
+    'test_analyzer',
+    'test_allreduce',
+    'test_allgather',
     'test_aligned_allocator',
+    'system_allocator_test',
+    'stringprintf_test',
+    'stringpiece_test',
+    'split_test',
+    'selected_rows_test',
+    'selected_rows_functor_test',
+    'scope_test',
     'scatter_test',
+    'save_quant2_model_resnet50',
+    'save_quant2_model_gru',
+    'save_quant2_model_ernie',
+    'save_load_util_test',
+    'save_load_op_test',
     'save_load_combine_op_test',
+    'rw_lock_test',
+    'retry_allocator_test',
+    'reader_test',
+    'reader_blocking_queue_test',
+    'prune_test',
     'program_desc_test',
-    'lodtensor_printer_test',
-    'lod_tensor_test',
-    'gather_test',
-    'gather_op_test',
-    'fused_broadcast_op_test',
-    'exception_holder_test',
-    'decorator_test',
-    'ddim_test',
-    'data_layout_transform_test',
-    'cpu_vec_test',
-    'cow_ptr_tests',
-    'conditional_block_op_test',
-    'bfloat16_test',
-    'assign_op_test',
-    'unroll_array_ops_test',
-    'test_seqpool_cvm_concat_fuse_pass',
-    'test_seqpool_concat_fuse_pass',
-    'test_reshape_bf16_op',
-    'test_repeated_fc_relu_fuse_pass',
-    'test_py_reader_return_list',
-    'test_py_reader_lod_level_share',
-    'test_protobuf_descs',
-    'test_paddle_inference_api',
-    'test_operator_desc',
-    'test_operator',
-    'test_mkldnn_matmul_op_output_fuse_pass',
-    'test_mkldnn_inplace_pass',
-    'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass',
-    'test_layer',
-    'test_is_test_pass',
-    'test_graph_pattern_detector',
-    'test_fusion_seqpool_cvm_concat_op',
-    'test_fusion_seqpool_concat_op',
-    'test_fusion_seqexpand_concat_fc_op',
-    'test_fusion_gru_mkldnn_op',
-    'test_fleet_util',
-    'test_fleet_runtime',
-    'test_fleet_rolemaker_init',
-    'test_flags_mkldnn_ops_on_off',
-    'test_dataset_download',
-    'test_dataloader_unkeep_order',
-    'test_dataloader_keep_order',
-    'test_dataloader_dataset',
-    'test_crf_decoding_op',
-    'test_create_parameter',
-    'test_context_manager',
-    'test_analyzer',
-    'tensor_test',
-    'split_test',
-    'save_load_op_test',
+    'profiler_test',
     'place_test',
+    'pass_test',
     'op_version_registry_test',
+    'op_tester',
     'op_proto_maker_test',
     'op_kernel_type_test',
-    'mask_util_test',
-    'inlined_vector_test',
-    'infer_io_utils_tester',
-    'errors_test',
-    'enforce_test',
-    'dropout_op_test',
-    'data_type_test',
-    'cpu_info_test',
-    'cpu_helper_test',
-    'beam_search_decode_op_test',
-    'auto_growth_best_fit_allocator_test',
-    'test_skip_layernorm_fuse_pass',
-    'test_multihead_matmul_fuse_pass',
-    'test_fc_elementwise_layernorm_fuse_pass',
-    'version_test',
-    'variable_test',
-    'test_scale_matmul_fuse_pass',
-    'test_reshape_transpose_matmul_mkldnn_fuse_pass',
-    'test_multi_gru_seq_fuse_pass',
-    'test_multi_gru_fuse_pass',
-    'test_mkldnn_placement_pass',
-    'test_mkldnn_op_nhwc',
-    'test_matmul_transpose_reshape_fuse_pass',
-    'test_fs',
-    'test_fleet',
-    'test_cpu_quantize_squash_pass',
-    'test_cpu_quantize_placement_pass',
-    'test_cpu_quantize_pass',
-    'test_cpu_bfloat16_placement_pass',
-    'test_cpu_bfloat16_pass',
-    'test_conv_elementwise_add_mkldnn_fuse_pass',
-    'test_conv_concat_relu_mkldnn_fuse_pass',
-    'test_conv_bias_mkldnn_fuse_pass',
-    'test_conv_batch_norm_mkldnn_fuse_pass',
-    'test_conv_activation_mkldnn_fuse_pass',
-    'test_benchmark',
-    'test_batch_norm_act_fuse_pass',
-    'selected_rows_functor_test',
-    'save_load_util_test',
-    'pass_test',
     'operator_test',
     'operator_exception_test',
     'op_debug_string_test',
     'op_compatible_info_test',
     'op_call_stack_test',
-    'node_test',
     'no_need_buffer_vars_inference_test',
+    'node_test',
     'nccl_context_test',
+    'mmap_allocator_test',
     'math_function_test',
+    'mask_util_test',
+    'lod_tensor_test',
+    'test_check_abi',
+    'lodtensor_printer_test',
+    'jit_kernel_test',
+    'test_dispatch_jit',
+    'inlined_vector_test',
     'init_test',
+    'infer_io_utils_tester',
     'graph_to_program_pass_test',
     'graph_test',
     'graph_helper_test',
+    'gather_test',
+    'gather_op_test',
+    'fused_broadcast_op_test',
     'float16_test',
+    'exception_holder_test',
+    'errors_test',
+    'enforce_test',
+    'eigen_test',
+    'dropout_op_test',
+    'dlpack_tensor_test',
     'dist_multi_trainer_test',
+    'dim_test',
+    'device_worker_test',
+    'decorator_test',
+    'ddim_test',
+    'data_type_test',
+    'test_check_error',
+    'data_layout_transform_test',
+    'cudnn_helper_test',
+    'cudnn_desc_test',
+    'cpu_vec_test',
+    'cpu_info_test',
+    'cpu_helper_test',
+    'cow_ptr_tests',
+    'convert_model2dot_ernie',
+    'conditional_block_op_test',
     'cipher_utils_test',
+    'check_reduce_rank_test',
+    'buffered_allocator_test',
     'broadcast_op_test',
+    'bfloat16_test',
+    'beam_search_decode_op_test',
+    'auto_growth_best_fit_allocator_test',
+    'assign_op_test',
+    'allocator_facade_frac_flags_test',
     'aes_cipher_test',
 ]
 
 # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
 # just remove it from this list.
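+# Note: the per-tier widths requested by tools/windows/run_unittests.sh (12/4/2/serial)
+# are baselines; run_unittest_gpu multiplies them by CTEST_PARALLEL_LEVEL (default 1).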
 TETRAD_PARALLEL_JOB = [
-    'system_allocator_test', 'buffered_allocator_test',
-    'test_tensor_to_numpy',
+    'allocator_facade_frac_flags_test',
+    'cuda_helper_test',
+    'sequence_padding_test',
+    'test_auto_growth_gpu_memory_limit',
     'test_imperative_framework',
+    'device_context_test',
+    'test_reference_count_pass_last_lived_ops',
+    'copy_same_tensor_test',
+    'float16_gpu_test',
+    'test_leaky_relu_grad_grad_functor',
+    'sequence_pooling_test',
+    'mixed_vector_test',
+    'op_registry_test',
+    'strided_memcpy_test',
+    'selected_rows_functor_gpu_test',
+    'test_prepare_op',
+    'data_device_transform_test',
+    'test_tensor_to_numpy',
     'test_naive_best_fit_gpu_memory_limit',
-    'test_auto_growth_gpu_memory_limit',
+    'vol2col_test',
     'test_imperative_using_non_zero_gpu',
-    'cuda_helper_test',
     'retry_allocator_test',
-    'allocator_facade_frac_flags_test',
+    'system_allocator_test',
+    'test_fc_fuse_pass_cc',
+    'test_fc_lstm_fuse_pass_cc',
+    'test_fc_gru_fuse_pass_cc',
+    'test_conv_bn_fuse_pass_cc',
+    'test_adaptive_pool2d_convert_global_pass',
+    'test_unsqueeze2_eltwise_fuse_pass',
+    'test_layer_norm_fuse_pass_cc',
+    'test_fc_act_mkldnn_fuse_pass',
+    'test_fleet_cc',
+    'tensor_test',
+    'test_repeated_fc_relu_fuse_pass_cc',
+    'test_mkldnn_caching',
+]
+
+# It runs 2 jobs at a time. If a test fails due to insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED,
+# just remove it from this list.
+TWO_PARALLEL_JOB = [
+    'im2col_test',
+    'test_elementwise_add_grad_grad',
+    'test_logical_op',
+    'test_imperative_mnist',
+    'test_imperative_deepcf',
+    'test_cholesky_op',
+    'test_multiprocess_dataloader_iterable_dataset_static',
+    'test_sample_logits_op',
+    'test_ir_fc_fuse_pass',
+    'test_imperative_qat_channelwise',
+    'test_fleet_base_single',
+    'test_imperative_out_scale',
+    'test_multiprocess_dataloader_iterable_dataset_dynamic',
+    'test_fill_op',
+    'test_slice_op',
+    'test_cond',
+    'test_compiled_program',
+    'test_lstm',
+    'test_ema',
+    'test_py_reader_using_executor',
+    'test_nan_inf',
+    'test_isinstance',
+    'test_jit_save_load',
+    'test_box_clip_op',
+    'test_group_norm_op',
+    'test_seed_op',
+    'test_activation_nn_grad',
+    'test_pool2d_int8_mkldnn_op',
+    'test_adagrad_op_v2',
+    'test_elementwise_add_op',
+    'test_nn_functional_hot_op',
+    'test_op_name_conflict',
+    'test_softmax_with_cross_entropy_op',
+    'test_imperative_gan',
+    'test_simnet',
+    'test_instance_norm_op',
+    'test_amp_check_finite_and_scale_op',
+    'test_random_seed',
+    'test_histogram_op',
+    'test_sequence_conv',
+    'test_eye_op',
+    'test_row_conv_op',
+    'test_full_like_op',
+    'test_optimizer_in_control_flow',
+    'test_gru_unit_op',
+    'test_distribute_fpn_proposals_op',
+    'test_log_loss_op',
+    'test_adadelta_op',
+    'test_diag_embed',
+    'test_unsqueeze2_op',
+    'test_fused_fc_elementwise_layernorm_op',
+    'test_sum_bf16_mkldnn_op',
+    'test_sequence_erase_op',
+    'test_sigmoid_cross_entropy_with_logits_op',
+    'test_regularizer_api',
+    'test_lrn_op',
+    'test_rank_attention_op',
+    'test_parallel_ssa_graph_inference_feed_partial_data',
+    'test_lod_reset_op',
+    'test_install_check',
+    'test_anchor_generator_op',
+    'test_imperative_ptb_rnn',
+    'test_gather_nd_op',
+    'test_flatten_contiguous_range_op',
+    'test_network_with_dtype',
+    'test_elementwise_sub_op',
+    'test_assert_op',
+    'test_elementwise_div_op',
+    'test_gather_tree_op',
+    'test_decoupled_py_reader',
+    'test_imperative_named_members',
+    'test_conv3d_op',
+    'test_seqconv_eltadd_relu_fuse_pass',
+    'test_analysis_predictor',
+    'test_convert_operators',
+    'test_add_reader_dependency',
+    'test_is_tensor',
+    'test_variable',
+    'test_unsqueeze_op',
+    'test_save_model_without_var',
+    'test_unfold_op',
+    'test_conv_bn_fuse_pass',
+    'test_truncated_gaussian_random_op',
+    'test_tree_conv_op',
+    'test_traced_layer_err_msg',
+    'test_unique_with_counts',
+    'test_auc_single_pred_op',
+    'test_stack_op',
+    'test_conv_bn_fuse_pass',
+    'test_instance_norm_op_v2',
+    'test_softmax_bf16_mkldnn_op',
+    'test_mean_iou',
+    'test_sequence_slice_op',
+    'test_polygon_box_transform',
+    'test_sequence_pad_op',
+    'test_sequence_expand',
+    'test_cudnn_grucell',
+    'test_pool2d_bf16_mkldnn_op',
+    'test_bilinear_api',
+    'test_parallel_executor_inference_feed_partial_data',
+    'test_initializer_nn',
+    'test_modified_huber_loss_op',
+    'test_lookup_table_op',
+    'test_conv1d_layer',
+    'test_kron_op',
+    'test_isfinite_v2_op',
+    'test_ctc_align',
+    'test_imperative_save_load_v2',
+    'test_decayed_adagrad_op',
+    'test_generator_dataloader',
+    'test_dropout_op',
+    'test_functional_conv3d',
+    'test_executor_return_tensor_not_overwriting',
+    'test_flatten2_op',
+    'test_fsp_op',
+    'test_fusion_transpose_flatten_concat_op',
+    'test_elementwise_nn_grad',
+    'test_hinge_loss_op',
+    'test_elementwise_add_mkldnn_op',
+    'test_optimizer',
+    'test_deformable_conv_op',
+    'test_py_reader_push_pop',
+    'test_random_crop_op',
+    'test_shuffle_channel_op',
+    'test_center_loss',
+    'test_temporal_shift_op',
+    'test_case',
+    'test_transformer_api',
+    'test_bmm_op',
+    'test_adagrad_op',
+    'test_batch_norm_mkldnn_op',
+    'test_adam_op_multi_thread',
+    'test_adamax_op',
+    'test_while_loop_op',
+    'test_affine_grid_function',
+    'test_trilinear_interp_op',
+    'test_transpose_flatten_concat_fuse_pass',
+    'test_trace_op',
+    'test_backward',
+    'test_top_k_op',
+    'test_batch_fc_op',
+    'test_tensor_scalar_type_promotion_static',
+    'test_squared_l2_distance_op',
+    'test_bicubic_interp_op',
+    'test_spp_op',
+    'test_space_to_depth_op',
+    'test_callbacks',
+    'test_sigmoid_focal_loss_op',
+    'test_collect_fpn_proposals_op',
+    'test_sgd_op',
+    'test_sequence_unpad_op',
+    'test_conv1d_transpose_layer',
+    'test_sequence_slice_op',
+    'test_sequence_pool',
+    'test_conv_elementwise_add_fuse_pass',
+    'test_sequence_pad_op',
+    'test_conv_shift_op',
+    'test_sequence_expand_as',
+    'test_cos_sim_op',
+    'test_sequence_enumerate_op',
+    'test_cross_entropy2_op',
+    'test_sequence_concat',
+    'test_cudnn_lstmcell',
+    'test_data_norm_op',
+    'test_decoupled_py_reader_data_check',
+    'test_deformable_conv_v1_op',
+    'test_roi_align_op',
+    'test_detach',
+    'test_rnn_cells',
+    'test_elementwise_floordiv_op',
+    'test_elementwise_min_op',
+    'test_reduce_op',
+    'test_embedding_id_stop_gradient',
+    'test_empty_op',
+    'test_py_reader_combination',
+    'test_ptb_lm',
+    'test_expand_op',
+    'test_prroi_pool_op',
+    'test_fake_dequantize_op',
+    'test_fetch_feed',
+    'test_prelu_op',
+    'test_fill_zeros_like_op',
+    'test_pool2d_op',
+    'test_for_enumerate',
+    'test_gather_op',
+    'test_partial_concat_op',
+    'test_gaussian_random_op',
+    'test_paddle_imperative_double_grad',
+    'test_generate_proposals_v2_op',
+    'test_pad_constant_like',
+    'test_grid_sample_function',
+    'test_pad2d_op',
+    'test_huber_loss_op',
+    'test_one_hot_op',
+    'test_normal',
+    'test_imperative_auto_prune',
+    'test_nn_grad',
+    'test_nearest_interp_op',
+    'test_minus_op',
+    'test_imperative_reinforcement',
+    'test_maxout_op',
+    'test_matmul_op',
+    'test_increment',
+    'test_masked_select_op',
+    'test_lstmp_op',
+    'test_loop',
+    'test_label_smooth_op',
+    'test_logsumexp',
+    'test_log_softmax',
+    'test_learning_rate_scheduler',
+    'test_linspace',
+    'test_linear_interp_op',
+    'test_layer_norm_op_v2',
+    'test_lamb_op',
+    'test_lookup_table_v2_op',
+    'test_l1_norm_op',
+    'test_lstm_op',
+    'test_margin_rank_loss_op',
+    'test_index_sample_op',
+    'test_imperative_static_runner_while',
+    'test_imperative_save_load',
+    'test_imperative_ptb_rnn_sorted_gradient',
+    'test_mul_op',
+    'test_imperative_lod_tensor_to_selected_rows',
+    'test_imperative_data_parallel',
+    'test_norm_nn_grad',
+    'test_im2sequence_op',
+    'test_if_else_op',
+    'test_one_hot_v2_op',
+    'test_grid_sampler_op',
+    'test_pad_op',
+    'test_generate_proposals_op',
+    'test_parameter',
+    'test_gaussian_random_mkldnn_op',
+    'test_partial_sum_op',
+    'test_ftrl_op',
+    'test_flip',
+    'test_pool_max_op',
+    'test_prior_box_op',
+    'test_fake_quantize_op',
+    'test_proximal_gd_op',
+    'test_expand_v2_op',
+    'test_psroi_pool_op',
+    'test_expand_as_v2_op',
+    'test_ptb_lm_v2',
+    'test_rand_op',
+    'test_empty_like_op',
+    'test_rank_loss_op',
+    'test_elementwise_mod_op',
+    'test_reinforcement_learning',
+    'test_elementwise_max_op',
+    'test_retain_graph',
+    'test_edit_distance_op',
+    'test_reverse_op',
+    'test_device_guard',
+    'test_rnn_cells_static',
+    'test_deformable_psroi_pooling',
+    'test_roi_perspective_transform_op',
+    'test_segment_ops',
+    'test_cvm_op',
+    'test_selu_op',
+    'test_cross_op',
+    'test_sequence_conv',
+    'test_crop_tensor_op',
+    'test_sequence_expand',
+    'test_sequence_mask',
+    'test_conv_nn_grad',
+    'test_sequence_pool',
+    'test_conv_elementwise_add2_act_fuse_pass',
+    'test_sequence_reshape',
+    'test_conv2d_fusion_op',
+    'test_sequence_softmax_op',
+    'test_sequence_unpad_op',
+    'test_compare_reduce_op',
+    'test_clip_by_norm_op',
+    'test_box_coder_op',
+    'test_smooth_l1_loss_op',
+    'test_bilinear_interp_op',
+    'test_spectral_norm_op',
+    'test_sum_mkldnn_op',
+    'test_batch_norm_op',
+    'test_base_layer',
+    'test_argsort_op',
+    'test_arg_min_max_op',
+    'test_transpose_op',
+    'test_affine_grid_op',
+    'test_unpool_op',
+    'test_addmm_op',
+    'test_adam_optimizer_fp32_fp64',
+    'test_auc_op',
+    'test_adam_op',
+    'test_bilinear_tensor_product_op',
+    'test_break_continue',
+    'test_transpose_mkldnn_op',
+    'test_callback_reduce_lr_on_plateau',
+    'test_cast_op',
+    'test_scatter_nd_op',
+    'test_conv2d_transpose_op_depthwise_conv',
+    'test_queue',
+    'test_cross_entropy_op',
+    'test_detection',
+    'test_elementwise_mul_mkldnn_op',
+    'test_grid_generator',
+    'test_functional_conv2d',
+    'test_fit_a_line',
+    'test_fill_any_like_op',
+    'test_functional_conv2d_transpose',
+    'test_functional_conv3d_transpose',
+    'test_dot_op',
+    'test_gru_op',
+    'test_device',
+    'test_imperative_layer_apply',
+    'test_dataloader_early_reset',
+    'test_imperative_selected_rows_to_lod_tensor',
+    'test_crop_op',
+    'test_linear_interp_v2_op',
+    'test_lr_scheduler',
+    'test_tensor_array_to_tensor',
+    'test_mean_op',
+    'test_momentum_op',
+    'test_iou_similarity_op',
+    'test_optimizer_grad',
+    'test_dygraph_weight_norm',
+    'test_batch_norm_op_v2',
+    'test_pool2d_mkldnn_op',
+    'test_regularizer',
+    'test_sequence_concat',
+    'test_sequence_expand_as',
+    'test_sequence_reverse',
+    'test_shape_op',
+    'test_lod_tensor',
+    'test_diag',
+    'test_strided_slice_op',
+    'test_switch_case',
+    'test_target_assign_op',
+    'test_translated_layer',
+    'test_isfinite_op',
+    'test_conv_elementwise_add_act_fuse_pass',
+    'test_unbind_op',
+    'test_size_op',
+    'test_unique',
+    'test_unstack_op',
+    'test_wrappers',
+    'test_deprecated_decorator',
+    'test_affine_channel_op',
+    'test_arange',
+    'test_lrn_mkldnn_op',
+    'test_imperative_gnn',
+    'test_eager_deletion_while_op',
+    'test_dequantize_abs_max_op',
+    'test_elementwise_mul_op',
+    'test_tensor_scalar_type_promotion_dynamic',
+    'test_fc_op',
+    'test_mish_op',
+    'test_flatten_op',
+    'test_gradient_clip',
+    'test_allclose_layer',
+    'test_meshgrid_op',
+    'test_get_places_op',
+    'test_reader_reset',
+    'test_squared_l2_norm_op',
+    'test_softmax_mkldnn_op',
+    'test_numel_op',
+    'test_squeeze2_op',
+    'test_dygraph_mnist_fp16',
+    'test_activation_mkldnn_op',
+    'test_imperative_layer_children',
+    'test_nearest_interp_v2_op',
+    'test_fill_zeros_like2_op',
+    'test_sync_batch_norm_op',
+    'test_static_save_load',
+    'test_coalesce_tensor_op',
+    'test_fuse_bn_act_pass',
+    'test_simnet_v2',
+    'test_shard_index_op',
+    'test_cuda_random_seed',
+    'test_dequantize_log_op',
+    'test_mkldnn_batch_norm_act_fuse_pass',
+    'test_imperative_skip_op',
+    'test_proximal_adagrad_op',
+    'test_word2vec',
+    'test_conv2d_transpose_mkldnn_op',
+    'test_imperative_optimizer',
+    'test_assign_value_op',
+    'test_roi_pool_op',
+    'test_imperative_basic',
+    'test_word2vec',
+    'test_manual_seed',
+    'test_buffer_shared_memory_reuse_pass',
+    'test_range',
+    'test_activation_op',
+    'test_box_decoder_and_assign_op',
+    'test_imperative_optimizer_v2',
+    'test_python_operator_overriding',
+    'test_is_empty_op',
+    'test_imperative_qat',
+    'test_py_reader_pin_memory',
+    'test_train_recognize_digits',
+    'test_parallel_executor_feed_persistable_var',
+    'test_mnist',
+    'test_update_loss_scaling_op',
+    'test_rnn_cell_api',
+    'test_parallel_executor_fetch_isolated_var',
+    'test_imperative_load_static_param',
+    'test_fuse_bn_add_act_pass',
+    'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass',
 ]
 
 
 def main():
-    eight_parallel_job = '^job$'
+    cpu_parallel_job = '^job$'
     tetrad_parallel_job = '^job$'
-    non_parallel_job_1 = '^job$'
-    non_parallel_job_2 = '^job$'
+    two_parallel_job = '^job$'
+    non_parallel_job = '^job$'
 
     test_cases = sys.argv[1]
     test_cases = test_cases.split("\n")
-    for unittest in test_cases:
-        if unittest in CPU_PARALLEL_JOB:
-            eight_parallel_job = eight_parallel_job + '|^' + unittest + '$'
-            continue
-        if unittest in TETRAD_PARALLEL_JOB:
+
+    for unittest in CPU_PARALLEL_JOB:
+        if unittest in test_cases:
+            cpu_parallel_job = cpu_parallel_job + '|^' + unittest + '$'
+            test_cases.remove(unittest)
+
+    for unittest in TETRAD_PARALLEL_JOB:
+        if unittest in test_cases:
             tetrad_parallel_job = tetrad_parallel_job + '|^' + unittest + '$'
-            continue
+            test_cases.remove(unittest)
 
-        if len(non_parallel_job_1) < 10000:
-            non_parallel_job_1 = non_parallel_job_1 + '|^' + unittest + '$'
-        else:
-            non_parallel_job_2 = non_parallel_job_2 + '|^' + unittest + '$'
+    for unittest in TWO_PARALLEL_JOB:
+        if unittest in test_cases:
+            two_parallel_job = two_parallel_job + '|^' + unittest + '$'
+            test_cases.remove(unittest)
+
+    for unittest in test_cases:
+        non_parallel_job = non_parallel_job + '|^' + unittest + '$'
 
-    non_parallel_job = ",".join([non_parallel_job_1, non_parallel_job_2])
-    print("{};{};{}".format(eight_parallel_job, tetrad_parallel_job,
-                            non_parallel_job))
+    print("{};{};{};{}".format(cpu_parallel_job, tetrad_parallel_job,
+                               two_parallel_job, non_parallel_job))
 
 
 if __name__ == '__main__':
diff --git a/tools/static_mode_white_list.pyc b/tools/static_mode_white_list.pyc
deleted file mode 100644
index e9012c233595b6..00000000000000
Binary files a/tools/static_mode_white_list.pyc and /dev/null differ
diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh
index dd4b21c80d910b..8d52c1b84ae1b7 100644
--- a/tools/windows/run_unittests.sh
+++ b/tools/windows/run_unittests.sh
@@ -16,6 +16,7 @@ set -e
 set +x
 NIGHTLY_MODE=$1
 PRECISION_TEST=$2
+WITH_GPU=$3
 
 export PADDLE_ROOT="$(cd "$PWD/../" && pwd )"
 if [ ${NIGHTLY_MODE:-OFF} == "ON" ]; then
@@ -36,6 +37,14 @@ else
     disable_ut_quickly=''
 fi
 
+# check added ut
+set +e
+cp $PADDLE_ROOT/tools/check_added_ut.sh $PADDLE_ROOT/tools/check_added_ut_win.sh
+bash $PADDLE_ROOT/tools/check_added_ut_win.sh
+rm -rf $PADDLE_ROOT/tools/check_added_ut_win.sh
+set -e
+
+
 # /*==================Fixed Disabled Windows unittests==============================*/
 # TODO: fix these unittest that is bound to fail
 diable_wingpu_test="^lite_mul_model_test$|\
@@ -204,50 +213,50 @@ long_time_test="^best_fit_allocator_test$|\
 ^test_strided_slice_op$|\
 ^test_transpose_op$"
 
-export FLAGS_call_stack_level=2
-export FLAGS_fraction_of_gpu_memory_to_use=0.92
-export CUDA_VISIBLE_DEVICES=0
+if [ ${WITH_GPU:-OFF} == "ON" ];then
+    export FLAGS_call_stack_level=2
+    export FLAGS_fraction_of_gpu_memory_to_use=0.92
+    export CUDA_VISIBLE_DEVICES=0
 
-UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d')
-num=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' | wc -l)
-echo "Windows 1 card TestCases count is $num"
-if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
-    python ${PADDLE_ROOT}/tools/get_pr_ut.py
-    if [[ -f "ut_list" ]]; then
-        set +x
-        echo "PREC length: "`wc -l ut_list`
-        precision_cases=`cat ut_list`
-        set -x
+    UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d')
+    num=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' | wc -l)
+    echo "Windows 1 card TestCases count is $num"
+    if [ ${PRECISION_TEST:-OFF} == "ON" ]; then
+        python ${PADDLE_ROOT}/tools/get_pr_ut.py
+        if [[ -f "ut_list" ]]; then
+            set +x
+            echo "PREC length: "`wc -l ut_list`
+            precision_cases=`cat ut_list`
+            set -x
+        fi
     fi
-fi
 
-set +e
-if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then
-    UT_list_prec=''
-    re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}')
-    for case in $UT_list; do
-        flag=$(echo $case|grep -oE $re)
-        if [ -n "$flag" ];then
-            if [ -z "$UT_list_prec" ];then
-                UT_list_prec=$case
+    set +e
+    if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then
+        UT_list_prec=''
+        re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}')
+        for case in $UT_list; do
+            flag=$(echo $case|grep -oE $re)
+            if [ -n "$flag" ];then
+                if [ -z "$UT_list_prec" ];then
+                    UT_list_prec=$case
+                else
+                    UT_list_prec=$UT_list_prec'\n'$case
+                fi
             else
-                UT_list_prec=$UT_list_prec'\n'$case
+                echo $case "won't run in PRECISION_TEST mode."
             fi
-        else
-            echo $case "won't run in PRECISION_TEST mode."
-        fi
-    done
-    UT_list=$UT_list_prec
-fi
-set -e
-
-output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}")
-eight_parallel_job=$(echo $output | cut -d ";" -f 1)
-tetrad_parallel_jog=$(echo $output | cut -d ";" -f 2)
-non_parallel_job=$(echo $output | cut -d ";" -f 3)
+        done
+        UT_list=$UT_list_prec
+    fi
+    set -e
 
-non_parallel_job_1=$(echo $non_parallel_job | cut -d "," -f 1)
-non_parallel_job_2=$(echo $non_parallel_job | cut -d "," -f 2)
+    output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}")
+    cpu_parallel_job=$(echo $output | cut -d ";" -f 1)
+    tetrad_parallel_job=$(echo $output | cut -d ";" -f 2)
+    two_parallel_job=$(echo $output | cut -d ";" -f 3)
+    non_parallel_job=$(echo $output | cut -d ";" -f 4)
+fi
 
 failed_test_lists=''
 tmp_dir=`mktemp -d`
@@ -267,13 +276,20 @@ function collect_failed_tests() {
     set -e
 }
 
-function run_unittest() {
+function run_unittest_cpu() {
+    tmpfile=$tmp_dir/$RANDOM
+    (ctest -E "${disable_ut_quickly}" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) &
+    wait;
+}
+
+function run_unittest_gpu() {
     test_case=$1
     parallel_job=$2
+    parallel_level_base=${CTEST_PARALLEL_LEVEL:-1}
     if [ "$2" == "" ]; then
-        parallel_job=1
+        parallel_job=$parallel_level_base
    else
-        parallel_job=$2
+        parallel_job=`expr $2 \* $parallel_level_base`
    fi
    echo "************************************************************************"
    echo "********These unittests run $parallel_job job each time with 1 GPU**********"
@@ -285,7 +301,11 @@ function run_unittest() {
 }
 
 function unittests_retry(){
-    parallel_job=1
+    if [ "${WITH_GPU:-OFF}" == "ON" ];then
+        parallel_job=1
+    else
+        parallel_job=4
+    fi
    is_retry_execuate=0
    wintest_error=1
    retry_time=3
@@ -336,7 +356,7 @@ function unittests_retry(){
 
 function show_ut_retry_result() {
    if [[ "$is_retry_execuate" != "0" ]];then
-        failed_test_lists_ult=`echo "${failed_test_lists}" | grep -Po '[^ ].*$'`
+        failed_test_lists_ult=`echo "${failed_test_lists}"`
        echo "========================================="
        echo "There are more than 10 failed unit tests, so no unit test retry!!!"
        echo "========================================="
@@ -349,7 +369,7 @@ function show_ut_retry_result() {
        echo "========================================"
        echo "There are failed tests, which have been successful after re-run:"
        echo "========================================"
-        echo "The following tests have been re-ran:"
+        echo "The following tests have been re-run:"
        echo "${retry_unittests_record}"
    else
        failed_ut_re=$(echo "${retry_unittests_record_judge}" | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"|"$1}} END{print all_str}')
@@ -365,10 +385,25 @@ function show_ut_retry_result() {
 }
 
 set +e
-run_unittest $eight_parallel_job 8
-run_unittest $tetrad_parallel_jog 4
-run_unittest $non_parallel_job_1
-run_unittest $non_parallel_job_2
+
+if [ "${WITH_GPU:-OFF}" == "ON" ];then
+    if [ -f "$PADDLE_ROOT/added_ut" ];then
+        added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$
+        ctest -R "(${added_uts})" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$?
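+        # The ctest call above uses --repeat-until-fail 3: every matched test must
+        # pass three consecutive runs, so a flaky newly added test fails this gate.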
+ if [ "$added_ut_error" != 0 ];then + echo "========================================" + echo "Added UT should pass three additional executions" + echo "========================================" + exit 8; + fi + fi + run_unittest_gpu $cpu_parallel_job 12 + run_unittest_gpu $tetrad_parallel_job 4 + run_unittest_gpu $two_parallel_job 2 + run_unittest_gpu $non_parallel_job +else + run_unittest_cpu +fi collect_failed_tests set -e rm -f $tmp_dir/*