diff --git a/CMakeLists.txt b/CMakeLists.txt index 30f9e3a3dcdd2c..f30671bd3a87e8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.10) cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -22,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -34,10 +31,14 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It use option above, so put here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() @@ -65,7 +66,7 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ 
-103,9 +104,9 @@ if(WIN32) endif() endforeach(flag_var) endif() - - # NOTE(Avin0323): Less parallel count result in faster compilation. + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -113,7 +114,10 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling + if(NOT WITH_GPU) + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + endif() endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") @@ -133,6 +137,9 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() endforeach(flag_var) if (WITH_WIN_DUMP_DBG) @@ -182,7 +189,6 @@ option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) @@ -199,6 +205,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD 
"Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -259,9 +266,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -349,6 +353,11 @@ if (WITH_MIPS) add_definitions(-DPADDLE_WITH_MIPS) endif() +if (WITH_HETERPS) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() +endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") @@ -366,6 +375,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/README.md b/README.md index e8a7013d0b4432..8b437e4115abe8 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - -

+

diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bf1352d4e11479..e7f125269be1f5 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -177,10 +177,6 @@ if(WITH_HETERPS) add_definitions(-DPADDLE_WITH_HETERPS) endif() -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) - if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bddd2023b437b1..414b2a54be0342 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,7 +21,13 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() -if(WITH_ASCEND) +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + + +if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) @@ -43,9 +49,6 @@ if(WITH_ASCEND) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) - endif() ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) @@ -62,17 +65,23 @@ endif() if(WITH_ASCEND_CL) set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) - set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) - message(STATUS "ASCEND_CL_INC_DIR 
${ASCEND_CL_INC_DIR}") + message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") - INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) + INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) + INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 4619f9f7b7e34c..aa471002eacb6a 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -33,7 +33,9 @@ elseif(LINUX) # which will cause compiler error of using __host__ funciont in __host__ __device__ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) - set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) endif() endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c85654a5674a00..a5c74a46631e9d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT 
("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -807,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4864e04fa05164..9694a7bc59c12a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -211,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index 19fdb6c601a112..b11156d2e9986f 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,10 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() - # It has not been used now, it can 
specify CUDA compile flag manualy, + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. - # Now, it's used in CUDA:[10.0, 10.2] + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7dac91e531e4cf..75b1100caa915e 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -180,8 +180,8 @@ function(op_library TARGET) list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") list(REMOVE_ITEM hip_srcs "cholesky_op.cu") - list(REMOVE_ITEM hip_srcs "correlation_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") + list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 296940dc3f50cc..3c069bd2981c43 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 81fa7d0dfa98f0..f90fa3509d63d4 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. 
DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. # For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." 
"Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -276,7 +276,7 @@ endif(WITH_BOX_PS) if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) list(APPEND third_party_deps extern_ascend) endif() if(WITH_ASCEND_CL) @@ -290,7 +290,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 00000000000000..b4f42dab6790bf --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* 
fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a33..98d4bfd0326c5c 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. */ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index a2062d82c8130b..905347d031b35b 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -11,8 +11,8 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() -add_subdirectory(table) add_subdirectory(service) +add_subdirectory(table) add_subdirectory(test) add_subdirectory(index_dataset) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index 9aafdd769ed4a0..dfd55f16e1a065 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,6 +146,44 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } +std::future FleetWrapper::PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, std::vector* fea_keys, + std::vector>* fea_values, int fea_value_dim) { + fea_keys->clear(); + fea_keys->resize(0); + fea_keys->reserve(MAX_FEASIGN_NUM); + for (auto name : var_names) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; + int64_t* ids = tensor->data(); + size_t len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + if (ids[i] == 0u) { + continue; + } + fea_keys->push_back(static_cast(ids[i])); + } + } + fea_values->resize(fea_keys->size() + 1); + 
for (auto& t : *fea_values) { + t.resize(fea_value_dim); + } + std::vector pull_result_ptr; + for (auto& t : *fea_values) { + pull_result_ptr.push_back(t.data()); + } + + bool training = true; + return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(), + table_id, fea_keys->data(), + fea_keys->size(), training); +} + void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 863440180a808d..0da5d1e2bf987f 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,6 +84,15 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); + // Pull sparse variables from server in async mode + // Param: scope, table_id, var_names, fea_keys, fea_dim + // Param: fea_values std::future + std::future PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector* fea_keys, + std::vector>* fea_values, int fea_dim); + // Pull sparse variables from server in sync mode // pull immediately to tensors // is_training is true means training, false means inference, the behavior is diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc index 58f85d98fb09c6..3e573bbdd2de97 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.cc +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -13,13 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/distributed/index_dataset/index_sampler.h" -#include "paddle/fluid/operators/math/sampler.h" namespace paddle { namespace distributed { -using Sampler = paddle::operators::math::Sampler; - std::vector> LayerWiseSampler::sample( const std::vector>& user_inputs, const std::vector& target_ids, bool with_hierarchy) { @@ -30,22 +27,7 @@ std::vector> LayerWiseSampler::sample( std::vector(user_feature_num + 2)); auto max_layer = tree_->Height(); - std::vector sampler_vec(max_layer - start_sample_layer_); - std::vector> layer_ids(max_layer - - start_sample_layer_); - - auto layer_index = max_layer - 1; size_t idx = 0; - while (layer_index >= start_sample_layer_) { - auto layer_codes = tree_->GetLayerCodes(layer_index); - layer_ids[idx] = tree_->GetNodes(layer_codes); - sampler_vec[idx] = new paddle::operators::math::UniformSampler( - layer_ids[idx].size() - 1, seed_); - layer_index--; - idx++; - } - - idx = 0; for (size_t i = 0; i < input_num; i++) { auto travel_codes = tree_->GetTravelCodes(target_ids[i], start_sample_layer_); @@ -76,18 +58,15 @@ std::vector> LayerWiseSampler::sample( for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { int sample_res = 0; do { - sample_res = sampler_vec[j]->Sample(); - } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + sample_res = sampler_vec_[j]->Sample(); + } while (layer_ids_[j][sample_res].id() == travel_path[j].id()); outputs[idx + idx_offset][user_feature_num] = - layer_ids[j][sample_res].id(); + layer_ids_[j][sample_res].id(); outputs[idx + idx_offset][user_feature_num + 1] = 0; } idx += layer_counts_[j]; } } - for (size_t i = 0; i < sampler_vec.size(); i++) { - delete sampler_vec[i]; - } return outputs; } diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h index 66882bedc9b765..8813421446a21c 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.h +++ 
b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -16,6 +16,7 @@ #include #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -83,6 +84,23 @@ class LayerWiseSampler : public IndexSampler { } reverse(layer_counts_.begin(), layer_counts_.end()); VLOG(3) << "sample counts sum: " << layer_counts_sum_; + + auto max_layer = tree_->Height(); + sampler_vec_.clear(); + layer_ids_.clear(); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids_.push_back(tree_->GetNodes(layer_codes)); + auto sampler_temp = + std::make_shared( + layer_ids_[idx].size() - 1, seed_); + sampler_vec_.push_back(sampler_temp); + layer_index--; + idx++; + } } std::vector> sample( const std::vector>& user_inputs, @@ -94,6 +112,8 @@ class LayerWiseSampler : public IndexSampler { std::shared_ptr tree_{nullptr}; int seed_{0}; int start_sample_layer_{1}; + std::vector> sampler_vec_; + std::vector> layer_ids_; }; } // end namespace distributed diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index a9370561a540be..a1440260bf2e77 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "butil/object_pool.h" #include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" @@ -196,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, return 0; } - std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / 
sizeof(float)); - table->pull_dense(res_data.data(), num); + auto res_data = butil::get_object>(); + res_data->resize(num * table->value_accesor()->select_size() / sizeof(float)); + table->pull_dense(res_data->data(), num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -367,12 +369,13 @@ int32_t BrpcPsService::pull_sparse(Table *table, value.DeserializeFromBytes(const_cast(data)); - std::vector res_data; - res_data.resize(num * dim); - table->pull_sparse(res_data.data(), value); + auto res_data = butil::get_object>(); + res_data->resize(num * dim); + table->pull_sparse(res_data->data(), value); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index a6271cac83c9a9..eafb4d596cc167 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -135,7 +135,8 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); - PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), closure->response(request_idx), closure); diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index e185f23e3d240f..c6657be96ba446 100644 --- 
a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -54,19 +54,7 @@ class GraphPyService { std::vector table_feat_conf_feat_dtype; std::vector table_feat_conf_feat_shape; - // std::thread *server_thread, *client_thread; - - // std::shared_ptr pserver_ptr; - - // std::shared_ptr worker_ptr; - public: - // std::shared_ptr get_ps_server() { - // return pserver_ptr; - // } - // std::shared_ptr get_ps_client() { - // return worker_ptr; - // } int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } void GetDownpourSparseTableProto( diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index dde1f5ae8ee3a1..dab390958034af 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -13,7 +13,11 @@ set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTR set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc +sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} +${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git 
a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 020bcdcc52ef4b..0dc99de1bfe82a 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -171,7 +171,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { auto paths = paddle::string::split_string(path, ";"); - int count = 0; + int64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 8ddf3c8f904a6c..b18da82abe61c9 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -33,26 +33,11 @@ namespace paddle { namespace distributed { class GraphShard { public: - // static int bucket_low_bound; - // static int gcd(int s, int t) { - // if (s % t == 0) return t; - // return gcd(t, s % t); - // } size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { - this->shard_num = shard_num; - // bucket_size = init_bucket_size(shard_num); - // bucket.resize(bucket_size); - } + GraphShard(int shard_num) { this->shard_num = shard_num; } std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - // int init_bucket_size(int shard_num) { - // for (int i = bucket_low_bound;; i++) { - // if (gcd(i, shard_num) == 1) return i; - // } - // return -1; - // } std::vector get_ids_by_range(int start, int end) { std::vector res; for (int i = start; i < end && i < bucket.size(); i++) { @@ -64,7 +49,6 @@ class GraphShard { FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); - // std::unordered_map::iterator> std::unordered_map 
get_node_location() { return node_location; } @@ -131,7 +115,7 @@ class GraphTable : public SparseTable { protected: std::vector shards; size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; - const int task_pool_size_ = 11; + const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index 2e8c257b6aad47..718fce9950719f 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -125,34 +125,37 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { - int64_t not_save_num = 0; - for (auto value : block->values_) { - if (mode == SaveMode::delta && !value.second->need_save_) { - not_save_num++; - continue; - } - - auto* vs = value.second->data_.data(); - std::stringstream ss; - auto id = value.first; - ss << id << "\t" << value.second->count_ << "\t" - << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; - - for (int i = 0; i < block->value_length_; i++) { - ss << vs[i]; - ss << ","; - } + int64_t save_num = 0; + for (auto& table : block->values_) { + for (auto& value : table) { + if (mode == SaveMode::delta && !value.second->need_save_) { + continue; + } + save_num += 1; + + auto* vs = value.second->data_.data(); + std::stringstream ss; + auto id = value.first; + ss << id << "\t" << value.second->count_ << "\t" + << value.second->unseen_days_ << "\t" << value.second->is_entry_ + << "\t"; + + for (int i = 0; i < block->value_length_; i++) { + ss << vs[i]; + ss << ","; + } - ss << "\n"; + ss << "\n"; - os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second->need_save_ = 
false; + if (mode == SaveMode::base || mode == SaveMode::delta) { + value.second->need_save_ = false; + } } } - return block->values_.size() - not_save_num; + return save_num; } int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, @@ -183,7 +186,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, block->Init(id, false); - auto value_instant = block->GetValue(id); + VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { value_instant->count_ = std::stoi(values[1]); value_instant->unseen_days_ = std::stoi(values[2]); @@ -373,8 +376,10 @@ std::pair CommonSparseTable::print_table_stat() { int64_t feasign_size = 0; int64_t mf_size = 0; - for (auto& value : shard_values_) { - feasign_size += value->values_.size(); + for (auto& shard : shard_values_) { + for (auto& table : shard->values_) { + feasign_size += table.size(); + } } return {feasign_size, mf_size}; diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index 68d252661edd53..5c10fca98cda4d 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -26,8 +26,10 @@ #include #include "gflags/gflags.h" +#include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -47,6 +49,10 @@ namespace distributed { enum Mode { training, infer }; +static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 + << SPARSE_SHARD_BUCKET_NUM_BITS; + struct VALUE { explicit VALUE(size_t length) : length_(length), @@ -66,11 +72,11 @@ struct VALUE { bool is_entry_; // whether knock-in }; 
-inline bool count_entry(std::shared_ptr value, int threshold) { +inline bool count_entry(VALUE *value, int threshold) { return value->count_ >= threshold; } -inline bool probility_entry(std::shared_ptr value, float threshold) { +inline bool probility_entry(VALUE *value, float threshold) { UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); return uniform.GetValue() >= threshold; } @@ -145,7 +151,7 @@ class ValueBlock { const std::vector &value_dims) { auto pts = std::vector(); pts.reserve(value_names.size()); - auto &values = values_.at(id); + auto values = GetValue(id); for (int i = 0; i < static_cast(value_names.size()); i++) { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], @@ -159,35 +165,48 @@ class ValueBlock { // pull float *Init(const uint64_t &id, const bool with_update = true, const int counter = 1) { - if (!Has(id)) { - values_[id] = std::make_shared(value_length_); - } + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); - auto &value = values_.at(id); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + + table[id] = value; + + } else { + value = res->second; + } if (with_update) { AttrUpdate(value, counter); } - return value->data_.data(); } VALUE *InitGet(const uint64_t &id, const bool with_update = true, const int counter = 1) { - if (!Has(id)) { - values_[id] = std::make_shared(value_length_); - } + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); - auto &value = values_.at(id); + auto &table = values_[bucket]; + auto res = table.find(id); - if (with_update) { - AttrUpdate(value, counter); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + // value = _alloc.acquire(value_length_); + table[id] = value; + } else { + value = (VALUE *)(void *)(res->second); } - - return value.get(); + return value; } - void AttrUpdate(std::shared_ptr 
value, const int counter) { + void AttrUpdate(VALUE *value, const int counter) { // update state value->unseen_days_ = 0; value->count_ += counter; @@ -211,42 +230,73 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { - auto &value = values_.at(id); + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + VALUE *value = res->second; return value->data_.data(); } // for load, to reset count, unseen_days - std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); } + VALUE *GetValue(const uint64_t &id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } bool GetEntry(const uint64_t &id) { - auto &value = values_.at(id); + auto value = GetValue(id); return value->is_entry_; } void SetEntry(const uint64_t &id, const bool state) { - auto &value = values_.at(id); + auto value = GetValue(id); value->is_entry_ = state; } void Shrink(const int threshold) { - for (auto iter = values_.begin(); iter != values_.end();) { - auto &value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { - iter = values_.erase(iter); - } else { - ++iter; + for (auto &table : values_) { + for (auto iter = table.begin(); iter != table.end();) { + // VALUE* value = (VALUE*)(void*)(iter->second); + VALUE *value = iter->second; + value->unseen_days_++; + if (value->unseen_days_ >= threshold) { + butil::return_object(iter->second); + //_alloc.release(iter->second); + //_alloc.release(value); + iter = table.erase(iter); + } else { + ++iter; + } } } return; } float GetThreshold() { return threshold_; } + size_t compute_bucket(size_t hash) { + if (SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); + } + } 
private: bool Has(const uint64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { return false; } else { return true; @@ -254,8 +304,9 @@ class ValueBlock { } public: - std::unordered_map> values_; + robin_hood::unordered_map values_[SPARSE_SHARD_BUCKET_NUM]; size_t value_length_ = 0; + std::hash _hasher; private: const std::vector &value_names_; @@ -263,7 +314,7 @@ class ValueBlock { const std::vector &value_offsets_; const std::unordered_map &value_idx_; - std::function)> entry_func_; + std::function entry_func_; std::vector> initializers_; float threshold_; }; diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index b756c740ac764c..af87e1b6cc61d1 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,8 +1,10 @@ set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor +ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table +tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) diff --git 
a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h new file mode 100644 index 00000000000000..f5075b4545af04 --- /dev/null +++ b/paddle/fluid/distributed/thirdparty/round_robin.h @@ -0,0 +1,2685 @@ +// ______ _____ ______ _________ +// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ / +// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ / +// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ / +// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/ +// _/_____/ +// +// Fast & memory efficient hashtable based on robin hood hashing for +// C++11/14/17/20 +// https://github.com/martinus/robin-hood-hashing +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR \ + 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +#include +#define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +#include +#define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +#include +#define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream &operator<<(std::ostream &os, Counts const &c) { + return os << c.shiftUp << " shiftUp" << std::endl + << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts &counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +#define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +#error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +#ifdef _MSC_VER +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +#endif +#include +#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept->int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? 
static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + } \ + (x) +#else +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +#endif +#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +#endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +#define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +#define ROBIN_HOOD_LIKELY(condition) condition +#define ROBIN_HOOD_UNLIKELY(condition) condition +#else +#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +#ifdef _NATIVE_WCHAR_T_DEFINED +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor +// being constexpr +#ifdef _MSC_VER +#if _MSC_VER <= 1900 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \ + std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see +// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +#define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant< + std::size_t, alignof(typename std::remove_all_extents::type)> {}; + +template +class integer_sequence { + public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { return sizeof...(Ints); } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, + "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct 
IntSeqCombiner; + + template + struct IntSeqCombiner, + integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = typename IntSeqCombiner< + typename IntSeqImpl::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = + typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned +// char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target +// type". Use with +// care! +template +inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, +// thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args &&... 
args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T *assertNotNull(T *t, Args &&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const *ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory +// in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per +// allocation is the size of a +// pointer. +template +class BulkPoolAllocator { + public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED( + o) /*unused*/) noexcept : mHead(nullptr), + mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator &&o) noexcept + : mHead(o.mHead), + mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator & + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { reset(); } + + // Deallocates all allocated memory. 
+ void reset() noexcept { + while (mListForFree) { + T *tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T *allocate() { + T *tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T *obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator + // is from now on + // responsible for freeing the data (with free()). If the provided data is not + // large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in + // the destructor. + void addOrFree(void *ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator &other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + + private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually + // with addOrFree. In + // practice, this should not matter much. 
+ ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void *ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto *const headT = reinterpret_cast_no_cast_align_warning( + reinterpret_cast(ptr) + ALIGNMENT); + + auto *const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty + // cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning( + head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning( + head + (numElements - 1) * ALIGNED_SIZE) = mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T *performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... 
T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " + << ALIGNED_SIZE << " * " + << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + +// enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > + ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for + // walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = + ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT"); + + T *mHead{nullptr}; + T **mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + // we are not using the data, so just free it. + void addOrFree(void *ptr, + size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator + : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it +// either, so I'm making +// my own here. 
+namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = + noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not +// is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct +// is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first(), second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair const &o) noexcept( + noexcept(T1(std::declval())) && + noexcept(T2(std::declval()))) + : first(o.first), second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. 
+ explicit constexpr pair(std::pair &&o) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)), second(std::move(o.second)) {} + + constexpr pair(T1 &&a, T2 &&b) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(a)), second(std::move(b)) {} + + template + constexpr pair(U1 &&a, U2 &&b) noexcept( + noexcept(T1(std::forward(std::declval()))) && + noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)), second(std::forward(b)) {} + + template +// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize +// all members" +// if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval &>(), + std::declval &>(), + ROBIN_HOOD_STD::index_sequence_for< + U1...>(), + ROBIN_HOOD_STD::index_sequence_for< + U2...>()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair( + std::tuple &a, std::tuple &b, + ROBIN_HOOD_STD::index_sequence /*unused*/, + ROBIN_HOOD_STD::index_sequence< + I2...> /*unused*/) noexcept(noexcept(T1(std:: + forward(std::get( + std::declval< + std::tuple + &>()))...)) && + noexcept(T2(std::forward(std::get( + std::declval< + std::tuple &>()))...))) + : first(std::forward(std::get(a))...), + second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair &o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair &a, pair &b) noexcept( + noexcept(std::declval &>().swap(std::declval &>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const &x, pair const &y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const &x, pair const &y) { + return !(x == y); +} +template +inline constexpr bool +operator<(pair const &x, pair const &y) noexcept( + noexcept(std::declval() < std::declval()) && + noexcept(std::declval() < std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const &x, pair const &y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const &x, pair const &y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const &x, pair const &y) { + return !(x < y); +} + +inline size_t hash_bytes(void const *ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const *const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const *const data8 = + reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + 
ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, + // fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step +// of the result. 
+template +struct hash : public std::hash { + size_t operator()(T const &obj) const noexcept(noexcept( + std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const &str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const &sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T *ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const &obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned 
short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem +// when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const &o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const &o) noexcept( + noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for +// std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as +// member, +// or a DataNode with a pointer to std::pair. Which DataNode +// representation to use +// depends on how fast the swap() operation is. Heuristically, this is +// automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n +// info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set +// to 1 means the +// corresponding node contains data. 
Set to 2 means the corresponding Node is +// filled, but it +// actually belongs to the previous position and was pushed out because that +// place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at +// end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why +// I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair< + typename std::conditional::type, + T>>::type, + 4, 16384, IsFlat> { + public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, + T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = + Table; + + private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. 
+ using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for + // small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we + // allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + Args &&... args) noexcept(noexcept(value_type(std:: + forward( + args)...))) + : mData(std::forward(args)...) {} + + DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode + &&n) noexcept(std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M &ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const *operator->() const noexcept { return &mData; } + value_type *operator->() noexcept { return &mData; } + + const value_type &operator*() const noexcept { return mData; } + + value_type &operator*() noexcept { return mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData.second; + } + + void swap(DataNode &o) noexcept( + noexcept(std::declval().swap(std::declval()))) { 
+ mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M &map, Args &&... args) : mData(map.allocate()) { + ::new (static_cast(mData)) + value_type(std::forward(args)...); + } + + DataNode(M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode &&n) noexcept : mData(std::move(n.mData)) {} + + void destroy(M &map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { mData->~value_type(); } + + value_type const *operator->() const noexcept { return mData; } + + value_type *operator->() noexcept { return mData; } + + const value_type &operator*() const { return *mData; } + + value_type &operator*() { return *mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData->second; + } + + void swap(DataNode &o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type *mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const + // required) + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(Node const &n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus 
we just + // route k through. + // No need to disable this because it's just not used if not applicable. + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(key_type const &k) const noexcept { return k; } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const &>::type + getFirstConst(value_type const &vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const &source, M &target) const { + auto const *const src = reinterpret_cast(source.mKeyVals); + auto *tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = + target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), + tgt); + } + }; + + template + struct Cloner { + void operator()(M const &s, M &t) const { + auto const numElementsWithBuffer = + t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), + t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M &m) const noexcept { m.mNumElements = 0; } + + void nodesDoNotDeallocate(M &m) const noexcept { m.mNumElements = 0; } + }; + + template + struct Destroyer { + void nodes(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. 
+ auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = + typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return + // true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only + // enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy + // ctor. + + // Conversion constructor from iterator to const_iterator. 
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const &other) noexcept : mKeyVals(other.mKeyVals), + mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr) noexcept : mKeyVals(valPtr), + mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr), + mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter &operator=(Iter const &other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter &operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int)noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { return **mKeyVals; } + + pointer operator->() const { return &**mKeyVals; } + + template + bool operator==(Iter const &o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const &o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but + // unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See + // map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero + // one. 
+ if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +#if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +#else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +#endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const *mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey &&key, size_t *idx, InfoType *info) const { + // In addition to whatever hash is used, add another mul & shift so we get + // better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType *info, size_t *idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType *info, size_t *idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
+ void shiftUp(size_t startIdx, size_t const insertion_idx) noexcept( + std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) + Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept( + std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for + // the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const &key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not + // help. + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 + ? 
0 + : static_cast(std::distance( + mKeyVals, + reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table &o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is + // resized. + // @return True on success, false if something went wrong + void insert_move(Node &&keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not + // there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto &l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + + public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash(), WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the + // first insert. + // This tremendously speeds up ctor & dtor of a map that never receives an + // element. The + // penalty is payed at the first insert, and not before. Lookup of this empty + // map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is + // dictated by the + // standard, but we can ignore it. 
+ explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash &h = Hash{}, + const KeyEqual &equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && + noexcept(KeyEqual(equal))) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table &&o) noexcept : WHash(std::move(static_cast(o))), + WKeyEqual(std::move(static_cast(o))), + DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table &operator=(Table &&o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + 
DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table &o) + : WHash(static_cast(o)), + WKeyEqual(static_cast(o)), + DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate + // through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. Copy constructor of each entry is used. + // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table &operator=(Table const &o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we + // want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. 
+ destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{} + .nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to + // realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table &o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. 
+ return; + } + + Destroyer::value>{} + .nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. + bool operator==(const Table &other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const &otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table &other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + const key_type &key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + key_type &&key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, 
std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto &&vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args &&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type &key, Args &&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type &&key, Args &&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, + const key_type &key, Args &&... 
args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type &&key, + Args &&... args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type &key, + Mapped &&obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type &&key, Mapped &&obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + const key_type &key, + Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + key_type &&key, Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type &keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type &&keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count( + const OtherKey &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains( + const OtherKey &key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q &>::type at( + key_type const &key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const &>::type at( + key_type const &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find( + const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey &key, + is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if< + Self_::is_transparent, // NOLINT(modernize-use-nodiscard) + const_iterator>::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey &key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find( + const OtherKey 
&key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and + // only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), + const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type &key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && + WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying + // table. + // Does not do anything if load_factor is too large for decreasing the table's + // size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. 
+ if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) + size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= + (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that + // doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so + // we can load + // 64bit types. 
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against + // 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + + private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && + newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. 
+ if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node *const oldKeyVals = mKeyVals; + uint8_t const *const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = + calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the + // middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it + // helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. + if (oldKeyVals != + reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, + calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey &&key, Args &&... 
args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey &&key, Mapped &&obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = 
calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { + overflow_error, + key_found, + new_node, + overwrite_node + }; + + // Finds key, and if not already present prepares a spot where to pot the key + // & value. + // This potentially shifts nodes out of the way, updates mInfo and number of + // inserted + // elements, so the only operation left to do is create/assign a new node at + // that spot. + template + std::pair insertKeyPrepareEmptySpot(OtherKey &&key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. 
+ auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair( + insertion_idx, idx == insertion_idx ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? 
just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" + << mNumElements + << ", maxNumElementsAllowed=" << maxNumElementsAllowed + << ", load=" << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + nextHashMultiplier(); + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space + // left! + // Try to rehash instead. Delete freed memory so we don't steadyily + // increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. + // Make sure we only have odd numbers, so that the multiplication is + // reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); + } + return true; + } + + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. + // This is necessary + // so that the hash stays a mixing function (and thus doesn't have any + // information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); + } + + void destroy() { + if (0 == mMask) { + // don't deallocate! 
+ return; + } + + Destroyer::value>{} + .nodesDoNotDeallocate(*this); + + // This protection against not deleting mMask shouldn't be needed as it's + // sufficiently + // protected with the 0==mMask check, but I have this anyways because g++ 7 + // otherwise + // reports a compile error: attempt to free a non-heap object 'fm' + // [-Werror=free-nonheap-object] + if (mKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + } + + void init() noexcept { + mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); + mInfo = reinterpret_cast(&mMask); + mNumElements = 0; + mMask = 0; + mMaxNumElementsAllowed = 0; + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // members are sorted so no padding occurs + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node *mKeyVals = + reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t *mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 + // 16 byte 56 if NodeAllocator +}; + +} // namespace detail + +// map + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_map = detail::Table< + sizeof(robin_hood::pair) <= sizeof(size_t) * 6 && + std::is_nothrow_move_constructible>::value && + std::is_nothrow_move_assignable>::value, + MaxLoadFactor100, Key, T, Hash, KeyEqual>; + +// set + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_set = + 
detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_set = + detail::Table::value && + std::is_nothrow_move_assignable::value, + MaxLoadFactor100, Key, void, Hash, KeyEqual>; + +} // namespace robin_hood + +#endif diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 24bed277280839..e55fca403af3ac 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -301,8 +301,14 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS fast_threaded_ssa_graph_executor variable_helper) cc_library(executor_cache SRCS executor_cache.cc DEPS executor) -cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor) +if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor ${RPC_DEPS}) +else() + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor) +endif() cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry @@ -369,36 +375,3 @@ cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() - -##### 2.0 New custom op extension mechanism related ##### - -# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -if (WIN32) - set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - - set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - 
${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) - set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) - - cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) - - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) - target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) - - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(PADDLE_CUSTOM_OP_IMPORT_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.lib - CACHE INTERNAL "Paddle custom op import lib") - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.dll - CACHE INTERNAL "Paddle custom op dll") -endif() diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 97d58df6dc5738..c4b833ec94c294 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -246,7 +246,7 @@ class CustomOperator : public OperatorWithKernel { * it can only be determined at runtime. 
*/ framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::VarType::RAW, ctx.GetPlace()); } @@ -257,7 +257,7 @@ class CustomOperator : public OperatorWithKernel { */ framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const OpKernelType& expected_kernel_type) { + const OpKernelType& expected_kernel_type) const override { return OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, tensor.layout()); } diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 117841f80cf47e..259901c09f3e00 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -28,5 +28,8 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map); +// Interface for selective register custom op. 
+void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 103dd0c5ae599b..0fdb97db20af99 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,8 +354,36 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #endif return; - } + } else if (platform::is_npu_place(tensor->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + if (tensor->type() != proto::VarType::FP32) { + return; + } + + framework::LoDTensor cpu_tensor; + cpu_tensor.Resize(tensor->dims()); + float* cpu_data = static_cast( + cpu_tensor.mutable_data(platform::CPUPlace(), tensor->type())); + framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + bool flag = false; + for (int i = 0; i < cpu_tensor.numel(); i++) { + if (isnan(cpu_data[i]) || isinf(cpu_data[i])) { + flag = true; + break; + } + } + PADDLE_ENFORCE_NE( + flag, true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", + op_type, var_name)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Tensor[%s] use npu place. 
PaddlePaddle must compile with NPU.", + var_name)); +#endif + return; + } tensor_check(op_type, var_name, *tensor, place); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 628b9f0d70f598..cd5de19bdc0887 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -638,7 +638,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b52..fb2323d96e2916 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index e6a7d74cc43433..654b88920acaf6 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -139,6 +139,10 @@ message PipelineConfig { optional string schedule_mode = 3 [ default = '1F1B' ]; } +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; +} + message DistributedStrategy { // bool options optional Mode mode = 1 [ default = COLLECTIVE ]; @@ -169,6 +173,7 @@ message DistributedStrategy { optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 
27 [ default = 1 ]; optional bool find_unused_parameters = 28 [ default = true ]; + optional bool tensor_parallel = 29 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -182,6 +187,7 @@ message DistributedStrategy { optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 101991d2c1ba00..de007c128d7543 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } @@ -169,6 +169,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); @@ -294,6 +297,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -456,11 +462,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, 
#endif } else if (platform::is_npu_place(place_)) { #ifdef PADDLE_WITH_ASCEND_CL - // TODO(ascendrc): Support garbage collector on NPUPlace - VLOG(4) << "Skip NPU gc because it is not implemented now."; + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #else - PADDLE_THROW(platform::errors::Unimplemented( - "No NPU gc found in CPU/GPU/XPU paddle")); + PADDLE_THROW( + platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle")); #endif } } @@ -565,7 +582,6 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } } } - platform::AttachPointerHashToMKLDNNKey(this, place_); #else LOG(WARNING) << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index c8517b9503741b..03dd2cff655c06 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -43,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) -endif(WITH_ASCEND) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f700363..273939f6bee613 100644 --- 
a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index baa2fd126a4b77..f749ee8cfa0baa 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 6f063e830c2da7..1fb2f0fab4aff9 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -77,6 +77,21 @@ class HeterContext { mutex_[i] = new std::mutex(); } } + + void Reset() { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } void batch_add_keys( const std::vector>& thread_keys) { assert(thread_keys.size() == feature_keys_.size()); @@ -90,6 +105,15 @@ class HeterContext { } } + void batch_add_keys(int shard_num, + const std::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + 
feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 6df2cd52bb401d..67c44368b7aea4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -1,5 +1,13 @@ IF(WITH_GPU) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) + SET(HETERPS_DEPS device_context) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(HETERPS_DEPS ${HETERPS_DEPS} cub) + endif() + if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) + endif() + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index b7bb5110744649..67ff6b6acaefb2 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -103,12 +103,26 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, timeline.Start(); + threads.clear(); // merge thread_keys to shard_keys - for (size_t i = 0; i < thread_keys_.size(); i++) { - gpu_task->batch_add_keys(thread_keys_[i]); - for (int j = 0; j < thread_keys_thread_num_; j++) { - thread_keys_[i][j].clear(); + auto merge_ins_func = [this, gpu_task](int shard_num) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, 
thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } + }; + + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} + for (int i = 0; i < thread_keys_shard_num_; ++i) { + threads.push_back(std::thread(merge_ins_func, i)); + } + for (auto& t : threads) { + t.join(); } timeline.Pause(); @@ -261,6 +275,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { int device_num = heter_devices_.size(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -273,8 +288,8 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { size_max = std::max(size_max, feature_keys_count[i]); } if (HeterPs_) { - HeterPs_->show_one_table(0); - return; + delete HeterPs_; + HeterPs_ = nullptr; } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); @@ -295,6 +310,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + gpu_task_pool_.Push(gpu_task); } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 8dfbd3c268b866..9ab6b5d8c178b9 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -122,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_ASCEND_CL +NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUDefaultStreamGarbageCollector::Wait() 
const { + static_cast<platform::NPUDeviceContext *>(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void NPUDefaultStreamGarbageCollector::ClearCallback( + const std::function<void()> &callback) { + static_cast<platform::NPUDeviceContext *>(this->dev_ctx_) + ->AddStreamCallback(callback); +} +NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUUnsafeFastGarbageCollector::ClearCallback( + const std::function<void()> &callback) { + callback(); +} + +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? -1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 572c79d21a045b..2c2b57bbe420a8 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -131,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDefaultStreamGarbageCollector : public GarbageCollector { + public: + NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function<void()> &callback) override; +}; + +class NPUUnsafeFastGarbageCollector : public GarbageCollector { + public: + NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function<void()> &callback) override; +}; +#endif + template <typename Container> void GarbageCollector::Add(Container &&objs) { Add(std::forward<Container>(objs), []() {}); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0ca78c679aecaa..ab69170322ce3e 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -86,6 +86,7 @@ pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass
inference) pass_library(delete_quant_dequant_filter_op_pass inference) +pass_library(delete_dropout_op_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc new file mode 100644 index 00000000000000..09962239a01b18 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include <string> + +#include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" + +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(dropout_op); \ + GET_IR_NODE(dropout_op_out); \ + GET_IR_NODE(dropout_op_outmask); \ + GET_IR_NODE(any_op2); + +void DeleteDropoutOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_dropout_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteDropoutOpPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string dropout_op_out_name = dropout_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + auto var_map = any_op2_desc->Inputs(); + std::string arg_name = ""; + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + arg_name = name_m.first; + } + } + if (arg_name.size() == 0) { + LOG(INFO) << "Delete dropout op pass: can not find the input " + << dropout_op_out_name; + return; + } + + // modify the any_op2's inputs + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + std::vector<std::string> new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != dropout_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + any_op2_desc->Flush(); + // Delete the unneeded nodes.
+ GraphSafeRemoveNodes(graph, + {dropout_op, dropout_op_out, dropout_op_outmask}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_dropout_op_pass, + paddle::framework::ir::DeleteDropoutOpPass); diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.h b/paddle/fluid/framework/ir/delete_dropout_op_pass.h new file mode 100644 index 00000000000000..c49abf3c871ced --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class DeleteDropoutOpPass : public FusePassBase { + public: + virtual ~DeleteDropoutOpPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 59d071e1034590..48f79e63b4f0ea 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -290,10 +290,20 @@ static int BuildFusion(Graph* graph, const std::string& name_scope ids.push_back(inner_pattern_ins[js[iter]].first->Name()); embs.push_back(inner_pattern_ins[js[iter]].second->Name()); } + OpDesc new_op_desc; new_op_desc.SetType("fused_embedding_eltwise_layernorm"); new_op_desc.SetInput("Ids", ids); new_op_desc.SetInput("Embs", embs); + + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + new_op_desc.SetInput("SentId", {ids[2]}); + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d74e8e5f65cd20..064da3d941602e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2439,6 +2439,29 @@ PDNode *patterns::TransposeFlattenConcat::operator()( 
return concat_out; } +void patterns::DeleteDropoutOpPattern::operator()() { + auto any_op_out = pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input("dropout", "X") + ->AsInput(); + + auto dropout_op = + pattern->NewNode(dropout_op_repr())->assert_is_op("dropout"); + + auto dropout_op_out = pattern->NewNode(dropout_op_out_repr()) + ->assert_is_op_output("dropout", "Out") + ->AsIntermediate(); + + auto dropout_op_outmask = pattern->NewNode(dropout_op_outmask_repr()) + ->assert_is_op_output("dropout", "Mask") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + dropout_op->LinksFrom({any_op_out}); + dropout_op_out->LinksFrom({dropout_op}); + dropout_op_outmask->LinksFrom({dropout_op}); + any_op2->LinksFrom({dropout_op_out}); +} + void patterns::DeleteQuantOpFuse::operator()(PDNode *input_act_node, const std::string &quant_type) { auto *input_scale_node = pattern->NewNode(GetNodeName("input_scale_node")) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index cfac01ec9dedc8..13f65859954d58 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1464,6 +1464,19 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteDropoutOpPattern : public PatternBase { + DeleteDropoutOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_dropout_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(dropout_op); + PATTERN_DECL_NODE(dropout_op_out); + PATTERN_DECL_NODE(dropout_op_outmask); + PATTERN_DECL_NODE(any_op2); +}; + struct DeleteQuantDequantOpPattern : public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} diff --git 
a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44d..25bf03f426a1d9 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 06df1caca35b92..4eb532b47cb4b5 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -43,8 +43,9 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; const std::vector interpolate_op_types = { - "bilinear_interp", "nearest_interp", "trilinear_interp", "bicubic_interp", - "linear_interp"}; + "bilinear_interp", "nearest_interp", "trilinear_interp", + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "nearest_interp_v2"}; for (const Node* node : graph->Nodes()) { if (node->IsOp() && diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 1e8349e878781d..57bee20247c964 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -753,7 +753,7 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); auto* matmul_qk = 
pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); auto* matmul_qk_out_var = @@ -827,7 +827,7 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); transpose2_1_out_var->AsIntermediate()->assert_is_op_input( - "matmul"); // link to matmul qk + "matmul", "Y"); // link to matmul qk // Third path to matmul auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index fd604ffe7b5de4..35ba9200607799 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -77,7 +77,8 @@ bool PlacementPassBase::IsDefaultOpTypes(const std::string& op_type) const { // the corresponding pass. const std::vector not_default_op_types = { "bilinear_interp", "nearest_interp", "trilinear_interp", - "bicubic_interp", "linear_interp"}; + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "linear_interp_v2"}; bool is_interpolate_op = std::find(not_default_op_types.begin(), not_default_op_types.end(), op_type) != not_default_op_types.end(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cde..2fc39fd25d56c1 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], 
but got %s. " + "If you use PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast<size_t>(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 479df876fbe007..bf59c140005167 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -54,6 +54,17 @@ static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") { return false; } +static bool IsFCWithPaddingWeights(Node* n) { + bool res = false; + if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" && + n->inputs.size() == 3U && n->outputs.size() == 1U) { + if (n->Op()->HasAttr("padding_weights")) { + res = BOOST_GET_CONST(bool, n->Op()->GetAttr("padding_weights")); + } + } + return res; +} + static bool IsParamOfFC(Node* n, const std::string& param_name) { if (IsInputOfFC(n) && n->inputs.empty() && (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { @@ -255,7 +266,7 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, fc_ops[i] = pattern->NewNode( [=](Node* x) { - if (!IsFCWithAct(x, "relu")) { + if (!IsFCWithAct(x, "relu") || IsFCWithPaddingWeights(x)) { return false; } auto* fc_out_var = x->outputs[0]; diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef4..0a6b5e44452fe1 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast<Tensor>(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const
platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index b8911154e6bf7b..6b357aba1c5f9a 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -14,16 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#endif - -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -260,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f107321958ba7b..7d55d8c41e3e92 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -128,7 +128,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 7af5c54ceed74f..519bf8c633a013 100644 --- 
a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -447,6 +447,11 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +void OpDesc::RemoveOutput(const std::string &name) { + outputs_.erase(name); + need_update_ = true; +} + bool OpDesc::HasProtoAttr(const std::string &name) const { auto &op_info = OpInfoMap::Instance(); if (op_info.Has(desc_.type())) { diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 95c33bca6c7f1d..1bc1a308e453bb 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -65,6 +65,7 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); + void RemoveOutput(const std::string &name); bool HasAttr(const std::string &name) const { return attrs_.find(name) != attrs_.end(); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 4c529329761227..593d4d839fa910 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -25,7 +25,8 @@ limitations under the License. 
*/ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "gflags/gflags.h" +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -67,6 +68,8 @@ class Version; } // namespace framework } // namespace paddle +DECLARE_bool(check_kernel_launch); + namespace paddle { namespace framework { @@ -134,6 +137,19 @@ class OpRegistry { static std::unique_ptr CreateOp(const OpDesc& op_desc); }; +template +inline void CheckKernelLaunch(const char* op_type) {} + +#ifdef PADDLE_WITH_CUDA +template <> +inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>( + const char* op_type) { + if (FLAGS_check_kernel_launch) { + PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type); + } +} +#endif + template struct OpKernelRegistrarFunctor; @@ -162,8 +178,9 @@ struct OpKernelRegistrarFunctor { RegisterKernelClass( op_type, library_type, customized_type_value, - [](const framework::ExecutionContext& ctx) { + [op_type](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); + CheckKernelLaunch(op_type); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor @@ -223,8 +240,13 @@ struct OpKernelRegistrarFunctorEx(op_type, library_type, - customized_type_value, Functor()); + RegisterKernelClass( + op_type, library_type, customized_type_value, + + [op_type](const framework::ExecutionContext& ctx) { + Functor()(ctx); + CheckKernelLaunch(op_type); + }); constexpr auto size = std::tuple_size>::value; @@ -343,6 +365,12 @@ struct OpKernelRegistrarFunctorEx &places, InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - 
BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. 
+ InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -778,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 
1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. 
Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). 
+ for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. - bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle 
should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - 
final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). - for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - 
buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1367,6 +1058,399 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map 
ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. 
+ auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + 
graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void 
ParallelExecutor::CreateVariableInfos( + std::vector *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() shoule be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() shoule be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph) { + std::vector final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr. 
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. 
It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs shoule contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map shoule contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2c..d4d0b534b55f05 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector &local_scopes, + bool create_new); + + std::unordered_map CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new); + + std::vector CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector *var_infos, + ir::Graph *graph); + + std::vector CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 5968df548dfb0f..75c42fa3e52736 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ 
b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index e740771e5ca9fc..00ff50abadd185 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad58375..7e48d0dc5f9620 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56cac..e53e3d973c5246 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } 
// namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d6882b25d22588..78fd1af09e2945 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_npu_place(tensor.place())) { +#ifdef PADDLE_WITH_ASCEND_CL + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& npu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + npu_dev_ctx.stream()); + npu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); #endif } else { os.write(static_cast(data_ptr), @@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, 
is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { @@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace 
is not supported when not compiled with NPU")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 85af9e50087024..22c8e1c1665f12 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -159,11 +159,15 @@ void TensorFromVector(const std::vector& src, } #endif #ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from + // cudaMemcpyAsync. + // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. + // aclrtMemcpyAsync is really "async" between cpu <-> npu. + // Since vector is on cpu, I think this function should be a "sync" operation, + // so pass nullptr as stream to memory::Copy(). else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); } #endif } @@ -202,10 +206,8 @@ inline void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); } #endif delete[] array; @@ -265,10 +267,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); } #endif } @@ -301,10 +302,9 @@ inline void TensorToVector(const 
Tensor& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); } #endif for (unsigned int i = 0; i < src.numel(); i++) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 7efb89ad7d9d9c..3ac36bd2e4a248 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/framework/heter_service.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -332,7 +331,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 6b9dbece8974c2..15073b6f78c5b3 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -76,7 +76,8 @@ REGISTER_TRAINER_CLASS(HeterBoxTrainer); (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/unused_var_check.cc 
b/paddle/fluid/framework/unused_var_check.cc index d2adbdd34512b3..0f8465ab8948e4 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -53,27 +53,28 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { // Use pointer here for safe static deinitialization static auto *allow_set = new std::unordered_set({ // called once - "batch_norm", // 0 - "batch_norm_grad", // 0 - "sync_batch_norm", // 0 - "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 - "dgc_momentum", // 0 - "fake_quantize_range_abs_max", // 0 - "rmsprop", // 0 - "sequence_conv_grad", // 0 - "roi_perspective_transform_grad", // 0 - "fill_zeros_like", // 1 - "fill_any_like", // 1 - "nce_grad", // 1 - "precision_recall", // 1 - "fusion_seqpool_cvm_concat", // 2 - "fused_batch_norm_act", // 2 - "fused_batch_norm_act_grad", // 2 - "data_norm", // 0 - "data_norm_grad", // 0 - "update_loss_scaling", // 0 + "batch_norm", // 0 + "batch_norm_grad", // 0 + "sync_batch_norm", // 0 + "sync_batch_norm_grad", // 0 + "inplace_abn", // 0 + "inplace_abn_grad", // 0 + "dgc_momentum", // 0 + "fake_quantize_range_abs_max", // 0 + "rmsprop", // 0 + "sequence_conv_grad", // 0 + "roi_perspective_transform_grad", // 0 + "fill_zeros_like", // 1 + "fill_any_like", // 1 + "nce_grad", // 1 + "precision_recall", // 1 + "fusion_seqpool_cvm_concat", // 2 + "fused_batch_norm_act", // 2 + "fused_batch_norm_act_grad", // 2 + "data_norm", // 0 + "data_norm_grad", // 0 + "update_loss_scaling", // 0 + "fused_embedding_eltwise_layernorm", // 0 }); return *allow_set; } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index fc754cbaf177c9..473df85aa0421e 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -36,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" 
#endif @@ -50,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a24c0ac09c7587..6bee3d44b2edd7 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -4,7 +4,7 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index a56458b21398b3..fd2bb6e5c99522 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -26,7 +26,24 @@ class VarBase; AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), - block_ops_(new std::unordered_set()) {} + block_ops_(new std::unordered_set()), + unsupported_fp16_ops_(new std::unordered_set()) { + auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); + auto fp16_dtype = framework::proto::VarType::FP16; + for (auto it = 
all_kernels.begin(); it != all_kernels.end(); it++) { + bool supported = false; + for (auto& kernel_type : it->second) { + if (platform::is_gpu_place(kernel_type.first.place_) && + kernel_type.first.data_type_ == fp16_dtype) { + supported = true; + } + } + if (!supported) { + unsupported_fp16_ops_->insert(it->first); + } + } +} + AmpOperators::~AmpOperators() {} AmpOperators& AmpOperators::Instance() { @@ -44,16 +61,26 @@ AmpOperators::GetMutableBlockOps() { return block_ops_; } +std::shared_ptr> +AmpOperators::GetMutableUnsupportedFp16Ops() { + return unsupported_fp16_ops_; +} + std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { os << "allow ops: "; auto allow_ops = ops.GetMutableAllowOps(); std::copy((*allow_ops).begin(), (*allow_ops).end(), std::ostream_iterator(os, " ")); - os << "; "; + os << "\n"; os << "block ops: "; auto block_ops = ops.GetMutableBlockOps(); std::copy((*block_ops).begin(), (*block_ops).end(), std::ostream_iterator(os, " ")); + os << "\n"; + os << "unsupported fp16 ops: "; + auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops(); + std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(), + std::ostream_iterator(os, " ")); return os; } @@ -156,6 +183,12 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, return new_ins; } else { auto dst_type = GetPromoteType(ins); + // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. + if (dst_type == framework::proto::VarType::FP16 && + AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. 
if ((op_type == "batch_norm" || op_type == "layer_norm") && diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 619c6b0baf896f..fa76c19688a693 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -40,6 +40,9 @@ class AmpOperators { std::shared_ptr> GetMutableBlockOps(); + std::shared_ptr> + GetMutableUnsupportedFp16Ops(); + private: AmpOperators(); // forbid calling default constructor @@ -50,6 +53,9 @@ class AmpOperators { // The set of ops that support fp16 calculation and are considered numerically // dangerous and whose effects may also be observed in downstream ops. std::shared_ptr> block_ops_; + + // The set of ops that has no fp16 CUDA kennel. + std::shared_ptr> unsupported_fp16_ops_; }; std::ostream& operator<<(std::ostream& os, AmpOperators& ops); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index d5350744e4c553..7bcc3d6c608c94 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -408,7 +408,8 @@ void BasicEngine::Execute() { VLOG(10) << "create temporary var of " << var->Name() << " for sum gradient within this graph!"; } else if (!inplace_grad_name_map.empty() && - inplace_grad_name_map.count(pair.first)) { + inplace_grad_name_map.count(pair.first) && + bwd_ins.count(inplace_grad_name_map.at(pair.first))) { // When calculate Inplace grad op, create a new output var. // If a tmp var has been created, there is no need to create it // again. 
@@ -470,12 +471,20 @@ void BasicEngine::Execute() { { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - if (tmp_ins_ptr == nullptr) { - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); - } else { - OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), - cur_op.place()); + try { + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, + cur_op.Attrs(), cur_op.place()); + } + } catch (platform::EnforceNotMet& exception) { + Clear(); + throw std::move(exception); + } catch (std::exception& ex) { + Clear(); + PADDLE_THROW(platform::errors::External("%s", ex.what())); } } diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 70359dc3fd25bf..a4af3117d3e32e 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index b91fc460781c79..9f036742f0f5dd 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -35,7 +35,7 @@ namespace imperative { void NCCLParallelContext::BcastNCCLId( std::vector &nccl_ids, // NOLINT - int root) { + int root, int server_fd) { if (strategy_.local_rank_ == root) { std::vector other_trainers; for (auto &ep : strategy_.trainer_endpoints_) { @@ -45,11 +45,14 @@ void NCCLParallelContext::BcastNCCLId( } platform::SendBroadCastCommID(other_trainers, &nccl_ids); } else { - platform::RecvBroadCastCommID(strategy_.current_endpoint_, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); } } void NCCLParallelContext::Init() { + int 
server_fd = -1; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); @@ -58,8 +61,13 @@ void NCCLParallelContext::Init() { for (size_t i = 0; i < nccl_ids.size(); ++i) { platform::dynload::ncclGetUniqueId(&nccl_ids[i]); } + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); } - BcastNCCLId(nccl_ids, 0); + BcastNCCLId(nccl_ids, 0, server_fd); int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { @@ -80,14 +88,20 @@ void NCCLParallelContext::Init() { } void NCCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; std::vector nccl_ids; nccl_ids.resize(1); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. 
+ server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); } - BcastNCCLId(nccl_ids, 0); + BcastNCCLId(nccl_ids, 0, server_fd); int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index bcaeb811b108c5..1eee393aa714bb 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -49,7 +49,8 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, int root); // NOLINT + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); void Init() override; diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index bd132f2576fec1..de5f9d75e9173a 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -63,15 +63,16 @@ std::shared_ptr CreateGradOpNode( } } -py::object PyLayerApply(const platform::Place& place, const py::object& cls, +py::object PyLayerApply(const platform::Place& place, const py::handle& cls, const py::args args, const py::kwargs kwargs) { + py::gil_scoped_acquire guard; auto bk_function = cls.attr("_backward_function"); auto context = bk_function(); auto forward = cls.attr("forward"); auto result_forward = forward(context, *args, **kwargs); std::shared_ptr py_layer_ctx = - std::make_shared(context.release().ptr()); + std::make_shared(context.ptr()); // make inputs to varbase std::vector> input_vars; // process args,`input_vars` only collect `imperative::VarBase` @@ -115,12 +116,12 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls, tuple_result[i].cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` should be 
`Tensor`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` can not be `None`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. } } } else { @@ -130,14 +131,18 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls, result_forward.cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` should be `Tensor`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` can not be `None`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. 
} } + if (output_vars.size() == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "At least one output of `PyLayer.forward` is a `Tensor`.")); + } NameVarBaseMap outs = {{"Out", output_vars}}; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index a92704ce447dc1..e3dd0a2aa75b41 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -443,10 +443,6 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { auto *cur_node = q.front(); q.pop(); - for (auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); - } - const auto &grad_pending_nodes = cur_node->GradPendingNodes(); for (auto &grad_pending_node : grad_pending_nodes) { PADDLE_ENFORCE_NOT_NULL( @@ -523,7 +519,6 @@ void Reducer::PrepareForBackward( q.pop(); for (const auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); auto &bwd_outs = cur_op.GetOutsMap(); for (const auto &pair : bwd_outs) { if (!pair.second.IsGrad()) { @@ -762,10 +757,11 @@ void Reducer::MarkGroupReady(size_t group_index) { // TODO(liuyuhui): Add try catch to deal with exception later, // otherwise the main thread will continue to run when an exception is // thrown in comm_pool_. 
- comm_pool_->enqueue([&] { + auto next_group = next_group_; + comm_pool_->enqueue([this, run_order, next_group, &group] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group, next_group_); + FusedAllReduceSchedule(run_order, group, next_group); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 4967df5341d355..2d8a08217b0b83 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -15,6 +15,7 @@ #include // NOLINT #include "paddle/fluid/imperative/nccl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "gtest/gtest.h" @@ -36,9 +37,13 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void BcastNCCLId(int local_rank, std::vector* nccl_ids) { auto strategy = GetStrategy(local_rank); + int server_fd = platform::CreateListenSocket(strategy.current_endpoint_); + platform::CUDAPlace gpu(local_rank); imperative::NCCLParallelContext ctx(strategy, gpu); - ctx.BcastNCCLId(*nccl_ids, 0); + ctx.BcastNCCLId(*nccl_ids, 0, server_fd); + + platform::CloseSocket(server_fd); } TEST(BcastNCCLId, Run) { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 777cb10e0754c3..41ad70e5a5741b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -83,7 +84,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if 
(gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), 0)); @@ -94,7 +95,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::CUDAPinnedGarbageCollector( BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); @@ -134,6 +135,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map) { platform::RecordEvent op_type_record_event(type); + platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 93fd85f13cbf08..c002c7a10cb7b3 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,7 +33,7 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type @@ -61,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f34475..255c6ca75dfd74 100644 --- 
a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c4..4bb08dc96b1cf5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -106,8 +106,8 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); std::string optim_cache_dir = argument->optim_cache_dir(); - bool int8_valid = - !(model_from_memory && optim_cache_dir.empty() && enable_int8); + bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && + enable_int8 && use_calib_mode); PADDLE_ENFORCE_EQ( int8_valid, true, platform::errors::PreconditionNotMet( @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git 
a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7ba..330f7a99847344 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h new file mode 100644 index 00000000000000..ae977c1403a879 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -0,0 +1,21 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +namespace paddle { +namespace inference { + +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 00000000000000..8f789139af9bfc --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and 
should not have another copy in + // fluid. + std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. 
+ framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; + std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : 
name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + 
input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + 
} +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 00000000000000..5a1d2506fdb09b --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e9..b8cac8992f4eed 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) { @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + 
config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 60de4234b41a8b..f57f07883dcd70 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -321,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. 
We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 03f86cc7ba6de6..82c95ba2c95712 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -32,10 +32,10 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) if(WITH_CRYPTO) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto) + analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator) else() cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto) + analysis_config zero_copy_tensor trainer_desc_proto custom_operator) endif() if(WIN32) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38c..853c1ac1da8742 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = 
precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. 
+ std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } +} void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4b6c746d57525a..89c8c7902bac9f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -191,22 +191,8 @@ bool 
AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(); - scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { - delete scope; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount(); - ++dev_id) { - memory::Release(platform::CUDAPlace(dev_id)); - } -#endif -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount(); - ++dev_id) { - memory::Release(platform::XPUPlace(dev_id)); - } -#endif - memory::Release(platform::CPUPlace()); - }); + // TODO(wilber): we need to release memory occupied by weights. + scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); @@ -537,6 +523,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +538,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } @@ -617,7 +614,7 @@ std::unique_ptr CreatePaddlePredictor< // This function can only be executed once per process. 
static std::once_flag custom_operators_registered; std::call_once(custom_operators_registered, - []() { paddle::RegisterAllCustomOperator(); }); + []() { inference::RegisterAllCustomOperator(); }); if (config.use_gpu()) { static std::once_flag gflags_initialized; diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc index 9cc491e10d691a..d78560239de50e 100644 --- a/paddle/fluid/inference/api/helper.cc +++ b/paddle/fluid/inference/api/helper.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/extension/include/ext_op_meta_info.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace inference { @@ -40,5 +43,20 @@ std::string to_string>>( return ss.str(); } +void RegisterAllCustomOperator() { + auto &op_meta_info_map = OpMetaInfoMap::Instance(); + const auto &meta_info_map = op_meta_info_map.GetMap(); + for (auto &pair : meta_info_map) { + const auto &all_op_kernels{framework::OperatorWithKernel::AllOpKernels()}; + if (all_op_kernels.find(pair.first) == all_op_kernels.end()) { + framework::RegisterOperatorWithMetaInfo(pair.second); + } else { + LOG(INFO) << "The operator `" << pair.first + << "` has been registered. 
" + "Therefore, we will not repeat the registration here."; + } + } +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 14b968f5834da8..c6d25137594b76 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -398,5 +398,7 @@ static bool IsFileExists(const std::string &path) { return exists; } +void RegisterAllCustomOperator(); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 793fc53d90b768..f6cdbb00b50453 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -411,7 +411,8 @@ void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap( + paddle::platform::MKLDNNDeviceContext::tls().get_curr_exec()); } void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbef..2bbd4bb837a22f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. 
/// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_{false}; + bool xpu_autotune_{true}; + std::string xpu_autotune_file_; + std::string xpu_precision_{"int16"}; + bool xpu_adaptive_seqlen_{false}; // mkldnn related. int mkldnn_cache_capacity_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1d77ddaf73ef70..b2e3de63691c55 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -110,6 +110,16 @@ const std::vector kTRTSubgraphPasses({ "transpose_flatten_concat_fuse_pass", }); +const std::vector kDlnneSubgraphPasses({ + "is_test_pass", // + "delete_dropout_op_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "depthwise_conv_bn_fuse_pass", // + "shuffle_channel_detect_pass", // + "dlnne_subgraph_pass", // +}); + const std::vector kLiteSubgraphPasses({ #ifdef PADDLE_WITH_LITE "lite_subgraph_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index a725ebab35eada..d7556b50031b7d 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy { /// \brief List of tensorRT subgraph 
passes. PD_INFER_DECL extern const std::vector kTRTSubgraphPasses; +/// \brief List of dlnne subgraph passes. +PD_INFER_DECL extern const std::vector kDlnneSubgraphPasses; + /// \brief List of lite subgraph passes. PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d8..9bb52ba5780251 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 00000000000000..521d24329d4641 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 00000000000000..2b049e992e71dd --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + 
PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 00000000000000..4b70ed7fbad297 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. +/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 00000000000000..c45454e86bdaac --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* 
params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(opt_cache_dir); + config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + 
CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool 
PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_engine_enabled();
+}
+
+void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config,
+                                     size_t tensor_num,
+                                     const char** tensor_name,
+                                     size_t* shapes_num, int32_t** min_shape,
+                                     int32_t** max_shape, int32_t** optim_shape,
+                                     PD_Bool disable_trt_plugin_fp16) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::map<std::string, std::vector<int>> min_input_shapes;
+  std::map<std::string, std::vector<int>> max_input_shapes;
+  std::map<std::string, std::vector<int>> optim_input_shapes;
+  for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) {
+    std::string name(tensor_name[tensor_index]);
+    std::vector<int> min_input_shape, max_input_shape, optim_input_shape;
+    for (size_t shape_index = 0; shape_index < shapes_num[tensor_index];
+         ++shape_index) {
+      min_input_shape.emplace_back(min_shape[tensor_index][shape_index]);
+      max_input_shape.emplace_back(max_shape[tensor_index][shape_index]);
+      optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]);
+    }
+    min_input_shapes[name] = std::move(min_input_shape);
+    max_input_shapes[name] = std::move(max_input_shape);
+    optim_input_shapes[name] = std::move(optim_input_shape);
+  }
+  config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes,
+                                 optim_input_shapes, disable_trt_plugin_fp16);
+}
+
+void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num,
+                                 const char** ops_name) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> ops_list;
+  for (size_t index = 0; index < ops_num; ++index) {
+    ops_list.emplace_back(ops_name[index]);
+  }
+  config->Exp_DisableTensorRtOPs(ops_list);
+}
+
+void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableTensorRtOSS();
+}
+PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_oss_enabled();
+}
+
+void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config,
+                                int32_t dla_core) {
+ 
CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableTensorRtDLA(dla_core);
+}
+PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_dla_enabled();
+}
+
+void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config,
+                               PD_PrecisionType precision, PD_Bool zero_copy,
+                               size_t passes_filter_num,
+                               const char** passes_filter,
+                               size_t ops_filter_num, const char** ops_filter) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> passes_filters, ops_filters;
+  for (size_t index = 0; index < passes_filter_num; ++index) {
+    passes_filters.emplace_back(passes_filter[index]);
+  }
+  for (size_t index = 0; index < ops_filter_num; ++index) {
+    ops_filters.emplace_back(ops_filter[index]);
+  }
+  config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy,
+                           passes_filters, ops_filters);
+}
+PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->lite_engine_enabled();
+}
+
+void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SwitchIrDebug(x);
+}
+void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMKLDNN();
+}
+void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config,
+                                     int32_t capacity) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetMkldnnCacheCapacity(capacity);
+}
+PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_enabled();
+}
+void PD_ConfigSetCpuMathLibraryNumThreads(
+    __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
+}
+int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->cpu_math_library_num_threads();
+}
+
+void PD_ConfigSetMkldnnOp(__pd_keep
PD_Config* pd_config, size_t ops_num,
+                          const char** op_list) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::unordered_set<std::string> op_names;
+  for (size_t index = 0; index < ops_num; ++index) {
+    op_names.emplace(op_list[index]);
+  }
+  config->SetMKLDNNOp(std::move(op_names));
+}
+void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMkldnnQuantizer();
+}
+void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMkldnnBfloat16();
+}
+PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_bfloat16_enabled();
+}
+void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num,
+                            const char** op_list) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::unordered_set<std::string> op_names;
+  for (size_t index = 0; index < ops_num; ++index) {
+    op_names.emplace(op_list[index]);
+  }
+  config->SetBfloat16Op(std::move(op_names));
+}
+PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->thread_local_stream_enabled();
+}
+PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_quantizer_enabled();
+}
+void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config,
+                             const char* prog_buffer, size_t prog_buffer_size,
+                             const char* params_buffer,
+                             size_t params_buffer_size) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
+                         params_buffer_size);
+}
+PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->model_from_memory();
+}
+void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMemoryOptim();
+}
+PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+ 
return config->enable_memory_optim(); +} +void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableProfile(); +} +PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->profile_enabled(); +} +void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGlogInfo(); +} +PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->glog_info_disabled(); +} +void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetInValid(); +} +PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->is_valid(); +} +void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableGpuMultiStream(); +} +void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->PartiallyRelease(); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h new file mode 100644 index 00000000000000..e44983e24484ea --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -0,0 +1,571 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. +/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. 
+/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. 
+/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the GPU device id. +/// +/// \param[in] pd_onfig config +/// \return The GPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the XPU device id. +/// +/// \param[in] pd_onfig config +/// \return The XPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \param[in] pd_onfig config +/// \return The initial size in MB of the GPU memory pool. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \param[in] pd_onfig config +/// \return The proportion of the initial memory pool size. +/// +PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on CUDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use CUDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use CUDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. 
+/// +/// \param[in] pd_onfig config +/// \param[in] x Whether the ir graph optimization is actived. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \param[in] pd_onfig config +/// \return Whether to use ir graph optimization. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the TensorRT engine. +/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// computation graph. In some models such as resnet50, GoogleNet and so on, +/// it gains significant performance acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] workspace_size The memory size(in byte) used for TensorRT +/// workspace. +/// \param[in] max_batch_size The maximum batch size of this prediction task, +/// better set as small as possible for less performance loss. +/// \param[in] min_subgrpah_size The minimum TensorRT subgraph size needed, if a +/// subgraph is smaller than this, it will not be transferred to TensorRT +/// engine. +/// \param[in] precision The precision used in TensorRT. +/// \param[in] use_static Serialize optimization information to disk for +/// reusing. +/// \param[in] use_calib_mode Use TRT int8 calibration(post training +/// quantization). +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine( + __pd_keep PD_Config* pd_config, int32_t workspace_size, + int32_t max_batch_size, int32_t min_subgraph_size, + PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode); +/// +/// \brief A boolean state telling whether the TensorRT engine is used. +/// +/// \param[in] pd_onfig config +/// \return Whether the TensorRT engine is used. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. +/// +/// \param[in] pd_onfig config +/// \param[in] tensor_num The number of the subgraph input. +/// \param[in] tensor_name The name of every subgraph input. +/// \param[in] shapes_num The shape size of every subgraph input. +/// \param[in] min_shape The min input shape of every subgraph input. +/// \param[in] max_shape The max input shape of every subgraph input. +/// \param[in] optim_shape The opt input shape of every subgraph input. +/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that +/// TRT plugin will not run fp16. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo( + __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, int32_t** max_shape, + int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16); +/// +/// \brief Prevent ops running in Paddle-TRT +/// NOTE: just experimental, not an official stable API, easy to be broken. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num ops number +/// \param[in] ops_name ops name +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name); +/// +/// \brief Replace some TensorRT plugins to TensorRT OSS( +/// https://github.com/NVIDIA/TensorRT), with which some models's inference +/// may be more high-performance. Libnvinfer_plugin.so greater than +/// V7.2.1 is needed. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the TensorRT OSS. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT OSS. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Enable TensorRT DLA +/// +/// \param[in] pd_onfig config +/// \param[in] dla_core ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla( + __pd_keep PD_Config* pd_config, int32_t dla_core); +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT DLA. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param[in] pd_onfig config +/// \param[in] precision Precion used in Lite sub-graph engine. +/// \param[in] zero_copy whether use zero copy. +/// \param[in] passes_filter_num The number of passes used in Lite sub-graph +/// engine. +/// \param[in] passes_filter The name of passes used in Lite sub-graph engine. +/// \param[in] ops_filter_num The number of operators not supported by Lite. +/// \param[in] ops_filter The name of operators not supported by Lite. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine( + __pd_keep PD_Config* pd_config, PD_PrecisionType precision, + PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter, + size_t ops_filter_num, const char** ops_filter); +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \param[in] pd_onfig config +/// \return Whether the Lite sub-graph engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. 
+/// +/// \param[in] pd_onfig config +/// \param[in] x whether to debug IR graph analysis phase. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief Turn on MKLDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. +/// Please see MKL-DNN Data Caching Design Document: +/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md +/// +/// \param[in] pd_onfig config +/// \param[in] capacity The cache capacity. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity( + __pd_keep PD_Config* pd_config, int32_t capacity); +/// +/// \brief A boolean state telling whether to use the MKLDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the number of cpu math library threads. +/// +/// \param[in] pd_onfig config +/// \param cpu_math_library_num_threads The number of cpu math library +/// threads. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads); +/// +/// \brief An int state telling how many threads are used in the CPU math +/// library. +/// +/// \param[in] pd_onfig config +/// \return The number of threads used in the CPU math library. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the operator type list to use MKLDNN acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. 
+/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Turn on MKLDNN quantization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the MKLDNN quantization is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on MKLDNN bfloat16. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN Bfloat16. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( + __pd_keep PD_Config* pd_config); +/// \brief Specify the operator type list to use Bfloat16 acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Enable the GPU multi-computing stream feature. +/// NOTE: The current behavior of this interface is to bind the computation +/// stream to the thread, and this behavior may be changed in the future. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the thread local CUDA stream is +/// enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the thread local CUDA stream is enabled. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the memory buffer of program and parameter. +/// Used when model and params are loaded directly from memory. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_buffer The memory buffer of program. +/// \param[in] prog_buffer_size The size of the model data. +/// \param[in] params_buffer The memory buffer of the combined parameters file. +/// \param[in] params_buffer_size The size of the combined parameters data. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer( + __pd_keep PD_Config* pd_config, const char* prog_buffer, + size_t prog_buffer_size, const char* params_buffer, + size_t params_buffer_size); +/// +/// \brief A boolean state telling whether the model is set from the CPU +/// memory. +/// +/// \param[in] pd_onfig config +/// \return Whether model and params are loaded directly from memory. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on memory optimize +/// NOTE still in development. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the memory optimization is +/// activated. +/// +/// \param[in] pd_onfig config +/// \return Whether the memory optimization is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on profiling report. +/// If not turned on, no profiling report will be generated. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the profiler is activated. +/// +/// \param[in] pd_onfig config +/// \return bool Whether the profiler is activated. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Mute all logs in Paddle inference. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether logs in Paddle inference are muted. +/// +/// \param[in] pd_onfig config +/// \return Whether logs in Paddle inference are muted. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the Config to be invalid. +/// This is to ensure that an Config can only be used in one +/// Predictor. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the Config is valid. +/// +/// \param[in] pd_onfig config +/// \return Whether the Config is valid. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid( + __pd_keep PD_Config* pd_config); +/// +/// \brief Partially release the memory +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease( + __pd_keep PD_Config* pd_config); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_inference_api.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h new file mode 100644 index 00000000000000..5f21dca1a7bf6a --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h @@ -0,0 +1,22 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pd_common.h" // NOLINT +#include "pd_config.h" // NOLINT +#include "pd_predictor.h" // NOLINT +#include "pd_tensor.h" // NOLINT +#include "pd_types.h" // NOLINT +#include "pd_utils.h" // NOLINT diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc new file mode 100644 index 00000000000000..f5287a5152957f --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/capi_exp/pd_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_PREDICTOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_predictor, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle predictor shouldn't be nullptr")); \ + auto& predictor = pd_predictor->predictor + +extern "C" { +__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) { + PADDLE_ENFORCE_NOT_NULL( + pd_config, paddle::platform::errors::InvalidArgument( + "The pointer of paddle predictor shouldn't be nullptr")); + PD_Predictor* pd_predictor = new PD_Predictor(); + paddle_infer::Config* config = + reinterpret_cast(pd_config); + pd_predictor->predictor = paddle_infer::CreatePredictor(*config); + delete config; + return pd_predictor; +} + +__pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Predictor* new_predictor = new PD_Predictor(); + new_predictor->predictor = predictor->Clone(); + return new_predictor; +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetInputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetOutputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetInputNames().size(); 
+} + +size_t PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetOutputNames().size(); +} +__pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetInputHandle(name); + return pd_tensor; +} + +__pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetOutputHandle(name); + return pd_tensor; +} + +PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->Run(); +} + +void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 00000000000000..d4542d0b6d394d --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] pd_config config +/// \return new predictor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor.
+/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void 
PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. +/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 00000000000000..9c661dea6f2bb2 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/capi_exp/pd_tensor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_TENSOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_tensor, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle tensor shouldn't be nullptr")); \ + auto& tensor = pd_tensor->tensor + +extern "C" { + +void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; } +void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size, + int32_t* shape) { + CHECK_AND_CONVERT_PD_TENSOR; + std::vector shapes(shape_size); + for (size_t index = 0; index < shape_size; ++index) { + shapes[index] = shape[index]; + } + tensor->Reshape(shapes); +} + +#define REPEAT_ALL_DATA_TYPE(func) \ + func(float, Float) func(int64_t, Int64) func(int32_t, Int32) \ + func(uint8_t, Uint8) func(int8_t, Int8) + +#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type) \ + type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType place) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + return tensor->mutable_data(paddle_infer::CvtToCxxPlaceType(place)); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL) +#undef PD_TENSOR_MUTABLE_DATA_IMPL + +#define PD_TENSOR_DATA_IMPL(type, Type) \ + type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType* place, int32_t* size) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + PADDLE_ENFORCE_NOT_NULL(place, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of place shouldn't be nullptr")); \ + PADDLE_ENFORCE_NOT_NULL(size, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of size shouldn't be nullptr")); \ + paddle_infer::PlaceType cxx_palce_type; \ + int cxx_size; \ + 
type* data = tensor->data(&cxx_palce_type, &cxx_size); \ + *place = paddle_infer::CvtFromCxxPlaceType(cxx_palce_type); \ + *size = static_cast(cxx_size); \ + return data; \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL) +#undef PD_TENSOR_DATA_IMPL + +#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type) \ + void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor, \ + const type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyFromCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL) +#undef PD_TENSOR_COPY_FROM_CPU_IMPL + +#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type) \ + void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyToCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL) +#undef PD_TENSOR_COPY_TO_CPU_IMPL + +#undef REPEAT_ALL_DATA_TYPE + +__pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape()); +} +void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor, + __pd_keep PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 00000000000000..29ea4b5d62e43c --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. 
+/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. 
+/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. 
+/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. 
+/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. +/// \param[in] lod lod information. 
+/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 00000000000000..a5da2913a9b207 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 00000000000000..2e762619f5567c --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define DESTROY_ONE_DIM_ARRAY(type) \ + void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) { \ + if (array != NULL) { \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type( \ + const std::vector& vec) { \ + PD_OneDimArray##Type* array = new PD_OneDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? NULL : new type[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = vec[index]; \ + } \ + return array; \ + } +#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector CvtOneDimArrayToVec##Type( \ + __pd_keep const PD_OneDimArray##Type* array) { \ + std::vector vec; \ + if (array != NULL) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = array->data[index]; \ + } \ + } \ + return vec; \ + } + +#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_ONE_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int) +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_ONE_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_ONE_DIM_ARRAY +#undef DESTROY_ONE_DIM_ARRAY + +void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { + if (array != NULL) { + if (array->size != 0) { + for (size_t index = 0; index < array->size; ++index) { + delete[] array->data[index]; + } + } + delete[] array->data; + delete array; + 
} +} +namespace paddle_infer { + +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec) { + PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr; + array->size = vec.size(); + array->data = vec.empty() ? NULL : new char*[vec.size()]; + for (size_t index = 0u; index < vec.size(); ++index) { + array->data[index] = new char[vec[index].size() + 1]; + memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1); + } + return array; +} + +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array) { + std::vector vec; + for (size_t index = 0; index < array->size; ++index) { + vec.emplace_back(array->data[index]); + } + return vec; +} + +} // namespace paddle_infer + +#define DESTROY_TWO_DIM_ARRAY(type) \ + void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) { \ + if (array != NULL) { \ + if (array->size != 0) { \ + for (size_t index = 0; index < array->size; ++index) { \ + PD_OneDimArray##type##Destroy(array->data[index]); \ + } \ + } \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type( \ + const std::vector>& vec) { \ + PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new PD_OneDimArray##Type*[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = CvtVecToOneDimArray##Type(vec[index]); \ + } \ + return array; \ + } +#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector> CvtTwoDimArrayToVec##Type( \ + __pd_keep const PD_TwoDimArray##Type* array) { \ + std::vector> vec; \ + if (array != NULL && array->size != 0) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]); \ + } \ + } \ + return vec; \ + } +#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_TWO_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_TWO_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_TWO_DIM_ARRAY +#undef DESTROY_TWO_DIM_ARRAY + +namespace paddle_infer { + +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) { + switch (place_type) { + case PD_PLACE_UNK: + return PlaceType::kUNK; + case PD_PLACE_CPU: + return PlaceType::kCPU; + case PD_PLACE_GPU: + return PlaceType::kGPU; + case PD_PLACE_XPU: + return PlaceType::kXPU; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle place type %d.", place_type)); + return PlaceType::kUNK; + } +} + +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) { + switch (place_type) { + case PlaceType::kCPU: + return PD_PLACE_CPU; + case PlaceType::kGPU: + return PD_PLACE_GPU; + case PlaceType::kXPU: + return PD_PLACE_XPU; + default: + return PD_PLACE_UNK; + } +} + +DataType CvtToCxxDatatype(PD_DataType data_type) { + switch (data_type) { + case PD_DATA_FLOAT32: + return DataType::FLOAT32; + case PD_DATA_INT64: + return DataType::INT64; + case PD_DATA_INT32: + return 
DataType::INT32; + case PD_DATA_UINT8: + return DataType::UINT8; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle data type %d.", data_type)); + return DataType::FLOAT32; + } +} + +PD_DataType CvtFromCxxDatatype(DataType data_type) { + switch (data_type) { + case DataType::FLOAT32: + return PD_DATA_FLOAT32; + case DataType::INT64: + return PD_DATA_INT64; + case DataType::INT32: + return PD_DATA_INT32; + case DataType::UINT8: + return PD_DATA_UINT8; + default: + return PD_DATA_UNK; + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h new file mode 100644 index 00000000000000..68e519d4bb5e95 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_utils.h +/// +/// \brief Some utility function to destroy paddle struct. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. 
+/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 00000000000000..8a61b9a884c3bf --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 00000000000000..fbae512ecd8557 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility function used to convert object between C Struct and C++ +/// Class. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. 
+/// \return target object. +/// +std::vector CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. 
+/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98bf..908e1ab990bb73 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d5..a64ef1eda828bf 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index a6484a13557052..7ea41839cb939f 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. 
Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc7..61199724bcfe30 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. 
TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -179,19 +160,11 @@ class Deconv2dOpConverter : public OpConverter { nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input, + TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_output, ksize, weight.get(), bias.get()); return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. - bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 74057addecd1f9..19d79510547ecc 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,6 +25,10 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { + // conservative judgment + if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { + return false; + } if (dims_x.d[i] != dims_y.d[i]) { return false; } @@ -43,25 +47,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - 
PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -193,25 +178,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index f13f1724541239..66a682db07b911 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -34,13 +34,17 @@ class 
EmbEltwiseLayerNormOpConverter : public OpConverter { VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - auto id_names = op_desc.Input("Ids"); - auto emb_names = op_desc.Input("Embs"); + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = op_desc.Input("SentEmbedding").front(); + std::vector id_names = {word_id_name, pos_id_name, + sent_id_name}; + std::vector emb_names = {word_emb_name, pos_emb_name, + sent_emb_name}; - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - "should be same ")); int input_num = id_names.size(); // Declare inputs @@ -91,99 +95,96 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - if (enable_int8) { - output_fp16 = 1; - } - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). 
" - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = 
creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). 
" + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = 
engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 194d76c737c7f9..6167e68df2b673 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -160,66 +160,67 @@ class FcOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { // not NCHW layout, but NLP layout with added 'x 1 x 1' auto x_dim = X->getDimensions(); - if (x_dim.nbDims == 3 || x_dim.nbDims == 2) { - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_dim.nbDims + 2; - for (int i = 0; i < x_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 0; - } - reshape_before_fc_dim.d[x_dim.nbDims] = 1; - 
reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1; - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && + x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) { + // fc which is just after self attention + regist_fc(X, n_output, weight, bias); + return; + } + PADDLE_ENFORCE_LE( + x_dim.nbDims - x_num_col_dims, 3, + platform::errors::InvalidArgument( + "Params and input dims mismatch. Paddle-TRT FC " + "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " + "x_dim.nbDims = %d, x_num_col_dims = %d.", + x_dim.nbDims, x_num_col_dims)); + auto output_name = op_desc.Output("Out").front(); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + // padding shape "x 1 x 1" + int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); + reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; + int cur_dim_index = reshape_before_fc_dim.nbDims - 1; + while (padding_length-- > 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 1; + } + while (cur_dim_index >= 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 0; + } - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - if (x_dim.nbDims == 3) { - if (x_num_col_dims == 2) { - reshape_after_fc_dim.nbDims = 3; - reshape_after_fc_dim.d[0] = 0; - 
reshape_after_fc_dim.d[1] = 0; - reshape_after_fc_dim.d[2] = 0; - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - auto dim = fc_layer->getOutput(0)->getDimensions(); - reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2]; - } - // x_dim.nbDims == 2 - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - } - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + // add fc layer + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n_output, weight.get(), bias.get()); + fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + + if (activation_type == "relu") { + reshape_after_fc_layer->setName( + ("shuffle_after_fc(Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), + nvinfer1::ActivationType::kRELU); + 
RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", + {output_name}, test_mode); } else { - regist_fc(X, n_output, weight, bias); + RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", + {output_name}, test_mode); } return; } diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index ca5b6a8b52e797..0436499cd40756 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec48094..7ef79e547d09ab 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", 
output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec57..0b97b5d87a3d50 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c15..d6277b5208d5a1 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t 
input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index e91a2ee13f4c2d..3940cc5dce1b00 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -65,13 +65,6 @@ class NearestInterpolateOpConverter : public OpConverter { scale_w = scale; } else { // axis are different in static/dynamic mode - PADDLE_ENFORCE_GT( - out_h, 0, platform::errors::InvalidArgument( - "out_h must be greater than 0 if scale is not set.")); - PADDLE_ENFORCE_GT( - out_w, 0, platform::errors::InvalidArgument( - "out_w must be greater than 0 if scale is not set.")); - bool with_dynamic = engine_->with_dynamic_shape(); int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 8de16df0a2f610..f72ae2c3ec2d7e 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -109,6 +109,12 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = 
Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (op_desc.Type() == "transpose2") { it = Registry::Global().Lookup("transpose"); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd28..d6711bbbd2cb52 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c10072602d7c51..90d6392fd6404e 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - 
platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 74d77d8be44937..a8a36e1238168a 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 1329608aecd205..654fe7e0133796 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -62,12 +62,6 @@ class RoiAlignOpConverter : public OpConverter { std::vector inputs{input_tensor, rois_tensor}; nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - engine_->with_dynamic_shape(), true, - platform::errors::InvalidArgument( - "TRT roi align plugin only accept the dynamic shape, because that " - "the roi_align will change the batch size.")); - auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); auto roi_align_layer = engine_->network()->addPluginV2( diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66ce..0fdc262f7e740b 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index b44bdcef7123c2..e621ac0514109d 100644 
--- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -52,57 +52,57 @@ class SkipLayerNormOpConverter : public OpConverter { bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - if (enable_int8) { - type = static_cast(nvinfer1::DataType::kHALF); - } - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + 
"CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index aee39b7cf0c14c..2ab024dff327fd 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -44,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -62,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. 
Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 75b317e7bfd90e..47a6dd783a70cf 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index a0292b21124633..6105e10799e552 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -58,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); - assert(layer != nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, 
input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e6919722..6158fd130bad8d 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 53225b79780773..48c7b7fdd0d79d 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -42,15 +42,13 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); -#endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); int8_teller_set.insert("multihead_matmul"); int8_teller_set.insert("skip_layernorm"); - int8_teller_set.insert("fused_embedding_eltwise_layernorm"); - int8_teller_set.insert("matmul"); - int8_teller_set.insert("stack"); int8_teller_set.insert("slice"); +#endif +#if IS_TRT_VERSION_GE(7130) + teller_set.insert("group_norm"); #endif } @@ -67,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. 
std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -102,6 +102,7 @@ struct SimpleOpTypeSetTeller : public Teller { "dropout", "prelu", "conv2d_transpose", + "depthwise_conv2d_transpose", "leaky_relu", "fc", "shuffle_channel", @@ -137,13 +138,95 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || + op_type == "depthwise_conv2d_transpose") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + << desc.Input("Input").size() << " input."; 
+ return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose" || + op_type == "depthwise_conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -151,7 +234,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -189,7 +272,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) 
return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -229,7 +323,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() != 3) { - VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " "but got dims " << shape.size() << ", so jump it."; return false; @@ -252,18 +346,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (registry == nullptr) return false; } - if (op_type == "fc" || op_type == "mul") { - const int x_num_col_dims = - desc.HasAttr("x_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) - : (desc.HasAttr("in_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) - : 1); - if (x_num_col_dims != 1 && x_num_col_dims != 2) { - return false; - } - } - if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -279,6 +361,25 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto interp_method = BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } } if (op_type == "roi_align") { @@ -303,6 
+404,235 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (spatial_scale <= 0.f) return false; } + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. 
Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. Slice on batch axis is not " + "supported in TensorRT"; + return false; + } + } + } + } + } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but reveceid Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. 
Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only support zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index a5fc9e73c5f27f..214e1a81e7dc04 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -225,6 +225,14 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( return input_types[0]; } +template +__global__ void apply_scale(T *data, T scale, int n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + data[tid] = data[tid] * scale; +#endif +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, @@ -291,10 +299,17 @@ int QkvToContextPluginDynamic::enqueue( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); + int n_q = seq_len * head_number_ * head_size_; + constexpr int threads = 128; + int blocks = (n_q + threads - 1) / threads; + + apply_scale<<>>(tptr, static_cast(scale_), + n_q); + const platform::CUDADeviceContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, seq_len, head_number_, head_size_, - qkptr, input1_data, tptr, half(scale_), half(0.0)); + qkptr, input1_data, tptr, half(1.), half(0.0)); int grid = batch * head_number_ * seq_len; int block = head_size_; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 75628adbe8a859..f74cd671d6dca0 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,10 +522,10 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS 
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -604,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -621,17 +630,17 @@ inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model 
--infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 00000000000000..de9e2afd705f93 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + 
PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = 
PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 00000000000000..d3a15cb285772d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. 
"; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 00000000000000..4369cd78dfa374 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + 
PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 00000000000000..18107704ae420f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, 
PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 00000000000000..f4017fc5a7f340 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + 
output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + 
"/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + 
PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 00000000000000..8951c446b1f83a --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + 
"/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 00000000000000..11de1a5a6fab4f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + 
PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 00000000000000..f4fd04e85840de --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index c1b12f5c0ecbb6..b1a45afa99d9a5 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -54,6 +54,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ 
-68,13 +69,14 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" "2. If no, please decrease the batch size of your model. %s\n\n", place_.device, string::HumanReadableSize(size), place_.device, - string::HumanReadableSize(avail), place_.device, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + place_.device, err_msg)); } } // namespace allocation diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index c733ba5c68c9bd..0d7065d8bfba0e 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -125,6 +125,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -139,7 +140,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. 
If yes, please stop them, or start PaddlePaddle on another GPU.\n" @@ -150,8 +151,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(avail), gpu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); } } diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 1eb0535831bb19..730d49e8acd930 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -207,12 +207,6 @@ void Copy(platform::NPUPlace dst_place, platform::SetNPUDeviceId(dst_place.device); - // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async, - // which is different from CUDA. In Paddle, when async is called, "sync" - // is run actually, which means Paddle doesn't fully supported async. - // TODO(ascendrc): Support NPU memcpy async for better performance. - stream = nullptr; - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; @@ -220,6 +214,12 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not done. + // So, its needed to do wait before sync operation. 
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } @@ -235,12 +235,6 @@ void Copy(platform::CPUPlace dst_place, platform::SetNPUDeviceId(src_place.device); - // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async, - // which is different from CUDA. In Paddle, when async is called, "sync" - // is run actually, which means Paddle doesn't fully supported async. - // TODO(ascendrc): Support NPU memcpy async for better performance. - stream = nullptr; - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; @@ -248,6 +242,9 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } @@ -270,6 +267,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } @@ -284,6 +285,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent 
record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index cecc70cc6dda8e..6e11c64afc4bd8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -167,7 +171,7 @@ endif() if (WITH_ASCEND_CL) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) - cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") @@ -195,3 +199,7 @@ endif() if(WITH_ASCEND_CL) cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 1cac9ed9f1dd08..055909ba6f486f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ Sigmoid Activation Operator )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 781a97c1ffcc17..836c5fa06f6dfe 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,382 +10,719 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using float16 = paddle::platform::float16; +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : zero; + } +}; template -struct CudaVecType { - using type = T; - static constexpr int vecsize = 1; +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? 
args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -struct CudaVecType { - using type = __half2; - static constexpr int vecsize = 2; +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : static_cast(alpha) * args[0]; + } }; -template <> -struct CudaVecType { - using type = float4; - static constexpr int vecsize = 4; +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 1 : alpha) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? 
args[0] : static_cast(alpha) * args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template -class BaseGPUFunctor { - public: - using ELEMENT_TYPE = T; +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(one / (one + exp(-x))); + } +}; - using AttrPair = std::vector>; +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1] * (one - args[1]); + } - AttrPair GetAttrs() { return AttrPair(); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -/* ========================================================================== */ +template +struct CudaSiluFunctor : public BaseActivationFunctor { + // MPType means Compute Type + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x / (one + exp(-x))); + } +}; -/* =========================== relu forward ============================ */ template -class ReluGPUFunctor : public BaseGPUFunctor { - private: - T zero_; +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) 
const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } - public: - ReluGPUFunctor() { zero_ = static_cast(0.0f); } - - // for relu forward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in) { - // relu forward : out = max(x, 0) - return in > zero_ ? in : zero_; - } - - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T in) { - // relu forward : out = max(x, 0) - return in > zero_ ? in : zero_; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type in) { - // relu forward : out = max(in, 0) - return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), - (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type in) { -// relu forward : out = max(in, 0) -#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(in, kzero), in); -#else - const float2 xx = __half22float2(in); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), - (xx.y > 0.0f) * static_cast(xx.y)); -#endif -} -/* ========================================================================== */ + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; -/* =========================== relu backward ============================ - */ +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + // Inputs: args[0], the input x + 
__device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; template -class ReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } - public: - ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // atan(x) = atan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1] * args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if 
x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T l = static_cast(lambda); + return (x >= -l && x <= l) ? zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCeilFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // ceil(x) = ceil(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(ceil(x)); + } +}; + +template +struct CudaFloorFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // floor(x) = floor(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(floor(x)); + } +}; + +template +struct CudaRoundFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // round(x) = round(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(round(x)); + } +}; + +// grad functor for ceil, floor and round +template +struct CudaZeroGradFunctor : public BaseActivationFunctor { + __device__ 
__forceinline__ T operator()(const T* args) const { + return static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cos(x) = cos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sin(x) = sin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cos(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tan(x) = tan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + 
return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // asin(x) = asin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // acos(x) = acos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + 
MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; - // for relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type out, - const typename CudaVecType::type dout) { - return out > zero_ ? dout : zero_; +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * sinh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cosh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + 
+template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tanh(x)); } +}; - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { - // relu backward : dx = out > 0 ? dout : 0 - return out > zero_ ? dout : zero_; +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T dout = static_cast(args[0]); + T out = static_cast(args[1]); + return dout * (one - out * out); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type out, - const CudaVecType::type dout) { - // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), - (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type out, - const CudaVecType::type dout) { -// relu backward : dx = out > 0 ? 
dout : 0; -#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(out, kzero), dout); -#else - const float2 xx = __half22float2(out); - const float2 yy = __half22float2(dout); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), - (xx.y > 0.0f) * static_cast(yy.y)); -#endif -} +template +struct CudaReciprocalFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // reciprocal(x) = 1 / x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return one / args[0]; + } +}; -/* ========================================================================== */ -/* ======================== leaky relu forward ======================== - */ template -class LeakyReluGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - float alpha_; +struct CudaReciprocalGradFunctor : public BaseActivationFunctor { + // dx = -dout * out^2 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return -args[0] * args[1] * args[1]; + } - public: - LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha_}}; - } - // leakyrelu forward : out = x > 0 ? x : x * alpha - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in) { - return in > zero_ ? in : static_cast(alpha_) * in; - } - - __device__ __forceinline__ T ComputeRemainder(const T in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - return in > zero_ ? in : static_cast(alpha_) * in; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - return make_float4((in.x > zero_) ? 
(in.x) : (in.x) * alpha_, - (in.y > zero_) ? (in.y) : (in.y) * alpha_, - (in.z > zero_) ? (in.z) : (in.z) * alpha_, - (in.w > zero_) ? (in.w) : (in.w) * alpha_); -} - -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - const float2 xx = __half22float2(in); - return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, - (xx.y > 0.0f) ? xx.y : xx.y * alpha_); -} -/* ========================================================================== */ +template +struct CudaExpFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // exp(x) = exp(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(exp(x)); + } +}; -/* =========================== leaky relu backward ======================= - */ template -class LeakyReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - float alpha_; +struct CudaExpGradFunctor : public BaseActivationFunctor { + // dx = dout * out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1]; + } - public: - LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha_}}; +template +struct CudaLogFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log(x) = log(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(x)); + } +}; + +template +struct CudaLogGradFunctor : public BaseActivationFunctor { + // dx = dout / x + // Inputs: args[0], the input dout + // args[1], the 
input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / args[1]; } - // for leaky relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in, - const typename CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return in > zero_ ? dout : static_cast(alpha_) * dout; + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSquareFunctor : public BaseActivationFunctor { + // square(x) = x * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[0]; } +}; - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return in > zero_ ? dout : static_cast(alpha_) * dout; +template +struct CudaSquareGradFunctor : public BaseActivationFunctor { + T two = static_cast(2.0f); + + // dx = dout * 2 * x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * two * args[1]; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, - const CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), - (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), - (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), - (in.w > zero_) ? (dout.w) : alpha_ * (dout.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< - float16>::Compute(const CudaVecType::type in, - const CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? 
dout : alpha * dout - const float2 xx = __half22float2(in); - const float2 yy = __half22float2(dout); - return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, - (xx.y > 0.0f) ? yy.y : alpha_ * yy.y); -} +template +struct CudaSqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sqrt(x) = sqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sqrt(x)); + } +}; -/* ========================================================================== */ +template +struct CudaSqrtGradFunctor : public BaseActivationFunctor { + T one_half = static_cast(0.5f); + + // dx = dout * 0.5 / out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return one_half * args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; -template -__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, - T* dx, int num, Functor functor) { - using VecType = typename CudaVecType::type; - constexpr int vecsize = CudaVecType::vecsize; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in_forward = reinterpret_cast(forward_data); - const VecType* in_dout = reinterpret_cast(dout); - VecType* out = reinterpret_cast(dx); - VecType forward_vec, dout_vec; - T in_data, dout_data; - for (int i = idx; i < loop; i += stride) { -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - forward_vec = __ldg(in_forward + i); - dout_vec = __ldg(in_dout + i); -#else - forward_vec = in_forward[i]; - dout_vec = in_dout[i]; -#endif - out[i] = functor.Compute(forward_vec, dout_vec); - } - - while (idx == loop && tail) { - in_data = forward_data[num - tail]; - dout_data = dout[num - tail]; - dx[num - tail] = 
functor.ComputeRemainder(in_data, dout_data); - --tail; - } -} - -template -__global__ void ActivationkernelVec(const T* src, T* dst, int num, - Functor functor) { - constexpr int vecsize = CudaVecType::vecsize; - using VecType = typename CudaVecType::type; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in = reinterpret_cast(src); - VecType* out = reinterpret_cast(dst); - VecType x_vec; - for (int i = idx; i < loop; i += stride) { -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - x_vec = __ldg(in + i); -#else - x_vec = in[i]; -#endif - out[i] = functor.Compute(x_vec); - } - - while (idx == loop && tail) { - dst[num - tail] = functor.ComputeRemainder(src[num - tail]); - --tail; - } -} +template +struct CudaRsqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // rsqrt(x) = rsqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(rsqrt(x)); + } +}; + +template +struct CudaRsqrtGradFunctor : public BaseActivationFunctor { + T minus_one_half = static_cast(-0.5f); + + // dx = dout * -0.5 / out^3 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return minus_one_half * args[0] * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; template -class ActivationGPUKernel +class ActivationCudaKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = nullptr; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor* x = nullptr; framework::Tensor* out = nullptr; - ExtractActivationTensor(context, 
&in_x, &out); - auto& dev_ctx = context.template device_context(); - - int num = in_x->numel(); - const T* input_data = in_x->data(); - T* output_data = out->mutable_data(dev_ctx.GetPlace(), - static_cast(num * sizeof(T))); - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; + ExtractActivationTensor(ctx, &x, &out); + out->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = Functor(); auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); + *attr.second = ctx.Attr(attr.first); } - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((num / vecsize + block - 1) / block, 1); - auto stream = context.cuda_device_context().stream(); - ActivationkernelVec<<>>( - input_data, output_data, num, functor); + LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + functor); } }; template -class ActivationGradGPUKernel +class ActivationGradCudaKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const framework::Tensor *x, *out, *d_out; framework::Tensor* d_x = nullptr; x = out = d_out = nullptr; - ExtractActivationGradTensor(context, &x, &out, &d_out, + ExtractActivationGradTensor(ctx, &x, &out, &d_out, &d_x); - int numel = d_out->numel(); - auto& dev_ctx = context.template device_context(); - auto* dx_data = d_x->mutable_data( - dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); - auto* dout_data = d_out->data(); + d_x->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + + std::vector ins = {d_out}; + std::vector outs = {d_x}; - auto* forward_data = 
dout_data; if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out - forward_data = out->data(); + ins.push_back(out); + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X - forward_data = x->data(); + ins.push_back(x); + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); + } else { + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((numel / vecsize + block - 1) / block, 1); - auto stream = context.cuda_device_context().stream(); - ActivationGradKernelVec<<>>( - forward_data, dout_data, dx_data, numel, functor); } }; @@ -395,12 +732,13 @@ class ActivationGradGPUKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ - grad_functor) \ +#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>, \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>, \ ops::ActivationKernel>); \ REGISTER_OP_CUDA_KERNEL( \ @@ -410,28 +748,28 @@ namespace plat = paddle::platform; ops::grad_functor>, \ ops::ActivationGradKernel>); -FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ - act_type, ops::ActivationGPUKernel>, \ - ops::ActivationGPUKernel>, \ - ops::ActivationGPUKernel>); \ + act_type, ops::ActivationCudaKernel>, \ + 
ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradGPUKernel>, \ - ops::ActivationGradGPUKernel>, \ - ops::ActivationGradGPUKernel>); + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, - LeakyReluGradGPUFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -444,7 +782,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ======================== elu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); REGISTER_OP_CUDA_KERNEL( elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); + square, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - square_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + square_grad, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -564,27 +910,29 @@ REGISTER_OP_CUDA_KERNEL( /* ========================== exp register ============================ */ REGISTER_OP_CUDA_KERNEL( - exp, ops::ActivationKernel>, - ops::ActivationKernel>, + exp, 
ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, ops::ActivationKernel>, ops::ActivationKernel>, - ops::ActivationKernel>); + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - exp_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + exp_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); /* ========================================================================== */ /* ========================== Log register ==================================*/ -REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); REGISTER_OP_CUDA_KERNEL( log_grad_grad, ops::LogDoubleGradKernel>); /* ========================================================================== */ + +REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, CudaAtanFunctor, + CudaAtanGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(acos, Acos, CudaAcosFunctor, + CudaAcosGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); 
+REGISTER_ACTIVATION_CUDA_KERNEL(asin, Asin, CudaAsinFunctor, + CudaAsinGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, CudaSinhFunctor, + CudaSinhGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CudaCoshFunctor, + CudaCoshGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, CudaRoundFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(log10, Log10, Log10Functor, Log10GradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(soft_relu, SoftRelu, SoftReluFunctor, + SoftReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(stanh, STanh, STanhFunctor, STanhGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(softplus, Softplus, SoftplusFunctor, + SoftplusGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(softsign, Softsign, SoftsignFunctor, + SoftsignGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(relu6, Relu6, Relu6Functor, Relu6GradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, + ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(hard_swish, HardSwish, HardSwishFunctor, + HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index fb9f956f17c0b1..ccd5bf528ba58c 100644 --- a/paddle/fluid/operators/activation_op.h 
+++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -430,7 +455,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2).template cast(); + out.device(d) = x * (temp1 || temp2).template cast(); } }; @@ -447,7 +472,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2).template cast(); + dx.device(d) = dout * (temp1 || temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -2129,6 +2154,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, 
SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 923b581af287d1..f368c658230555 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -77,8 +77,7 @@ class PowGradNPUKernel : public framework::OpKernel { // 2.1 Get a factor tensor with shape [1]. Tensor factor_tensor(framework::proto::VarType::FP32); factor_tensor.mutable_data({1}, place); - TensorFromVector(std::vector{factor}, ctx.device_context(), - &factor_tensor); + FillNpuTensorWithConstant(&factor_tensor, factor); // 2.2 Get the factor which has the shape with x and the same value with // factor. diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 00000000000000..181dd6eabe22d7 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 00000000000000..fe5b08af52a624 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f2..c7520dbd34f6a9 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif 
AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 2c3a9c366e4fd0..c699486a9140a3 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -39,33 +39,36 @@ __global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, __syncthreads(); const int64_t num = s_starts[size]; - int pre_xs_index = 0; - bool t_found_inf = false; - const MT t_scale = *scale; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { - // get the xs's index of thread - int xs_index = pre_xs_index; - while (idx < s_starts[xs_index]) xs_index++; - // avoid some tensor's numel is zero - while (idx >= s_starts[xs_index]) xs_index++; - pre_xs_index = xs_index - 1; + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; // get in data and out data - const T* in = xs[pre_xs_index]; - T* out = outs[pre_xs_index]; - int64_t in_idx = idx - s_starts[pre_xs_index]; + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; // Unscale - MT val = static_cast(in[in_idx]) * t_scale; + MT val = static_cast(in[in_idx]) * local_scale; T narrow_val = static_cast(val); out[in_idx] = narrow_val; // CheckFinite if (!isfinite(narrow_val)) { - t_found_inf = true; + local_found_inf = true; } } - if (t_found_inf) { + if (local_found_inf) { *found_inf = true; } } @@ -94,28 +97,30 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel 
{ scale_data, inverse_scale_v, found_inf_data); size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); // calculate each tensor's start index and copy to device auto h_starts_tensor = - memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); auto d_starts_tensor = memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] h_starts[0] = 0; for (int i = 1; i <= xs_size; i++) { - // the start index value of each tensor is - // the sum of previous tensor's size h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); } int64_t total_num = h_starts[xs_size]; memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, platform::CPUPlace(), h_starts, - (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); // copy each tensor's data address to device - auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; @@ -128,16 +133,18 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, - platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), - dev_ctx.stream()); + cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); // Launch Kernel - int block = 1024; - int block_num = block * 20; // each thread deal with 20 number - int grid = (total_num + block_num - 1) / 
block_num; + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<< - grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + CheckFiniteAndUnscale< + T, MPDType><<>>( d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); VLOG(3) << "finish kernel"; } diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 46f9f7ff089448..8fd45326e4ec61 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -24,12 +24,19 @@ namespace operators { using Tensor = framework::Tensor; +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, +// and clear it after this op. +// Which may leads to wrong result if the input tensors is not calculated +// on NPU device, but got from other way, for example, feeding. 
template class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); auto outs = ctx.MultiOutput("Out"); auto* found_inf = ctx.Output("FoundInfinite"); @@ -44,10 +51,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { // step1: inverse scale(RealDiv) Tensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(1.0)}, ctx.device_context(), - &const_tensor); - - ctx.template device_context().Wait(); + FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); // Inverse(1.0/scale) Tensor* tmp_inverse_out = const_cast(scale); @@ -59,53 +63,60 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { runner_inverse.Run(stream); tmp_inverse_out = &inverse_out; - size_t x_size = xs.size(); - for (size_t i = 0; i < x_size; ++i) { - found_inf_data = true; + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder. 
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - - // step2: CheckNumerics - // CheckNumerics runs on the Ascend AI CPU, which delivers poor - // performance. - Tensor check_xout(x->type()); - check_xout.Resize(x->dims()); - check_xout.mutable_data(ctx.GetPlace()); - try { - auto runner_checknumerics = - NpuOpRunner("CheckNumerics", {*x}, {check_xout}, - {{"message", std::string("check_nan_and_inf")}}); - runner_checknumerics.Run(stream); - } catch (platform::EnforceNotMet& exception) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; - } catch (...) 
{ - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; - } - if (!found_inf_data) { // MatMul auto runner_matmul = NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); runner_matmul.Run(stream); - } else { - // ZerosLike - auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {*out}, {}); - runner_zeroslike.Run(stream); - } // end if - } // end for + } + } // set found_inf to true - if (found_inf_data) { - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = true; - framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf); - } + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); } }; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index 99e81a4757d0e0..a80b83f0cbe51f 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -110,22 +110,22 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) { // out found_inf Tensor found_inf_tensor; found_inf_tensor.Resize({1}); - bool *is_finite_data = + bool *found_inf_data = found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_FALSE(*is_finite_data); + EXPECT_TRUE(*found_inf_data); ctx.Wait(); } TEST(check_finite_and_unscale, NPU_fp32) { 
f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(check_finite_and_unscale, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index b48b0e78892933..de1f83c1ee50d0 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -34,13 +34,39 @@ __global__ void GpuUpdateLossScaling( } template -__global__ void FillIf(T* data, const int64_t num, const T value, - const bool* has_inf) { - if (*has_inf) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < num; i += blockDim.x * gridDim.x) { - data[i] = value; - } +__global__ void FusedFillIf(T** outs, const size_t xs_size, + const int64_t* starts, const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - 
s_starts[out_index]; + + // set value + out_data[idx] = value; } } @@ -68,15 +94,52 @@ class LazyZeros { const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int64_t num = out->numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - FillIf<<>>( - out_data, num, static_cast(0), found_inf_data); + size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + 
int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index dd6dbfd5c0b653..45b28bf61e5d68 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -41,7 +41,7 @@ void Update(const platform::NPUDeviceContext& ctx, // bad_out_data = bad_in_data + 1 Tensor factor_tensor(bad_out_tensor->type()); factor_tensor.mutable_data({1}, place); - TensorFromVector(std::vector{1}, ctx, &factor_tensor); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, {*bad_out_tensor}, {}); runner_p2.Run(stream); @@ -84,7 +84,7 @@ void Update(const platform::NPUDeviceContext& ctx, // good_out_data = good_in_data + 1 Tensor factor_tensor(good_out_tensor->type()); factor_tensor.mutable_data({1}, place); - TensorFromVector(std::vector{1}, ctx, &factor_tensor); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, {*good_out_tensor}, {}); runner_p2.Run(stream); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index add533bafcb0a7..433cabcfee0104 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -162,6 +162,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, ops::AssignKernel, int, ops::AssignKernel, int64_t, ops::AssignKernel, bool, ops::AssignKernel, plat::float16, + ops::AssignKernel, plat::bfloat16, ops::AssignKernel); #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 5cf1303a229a90..792d01a5efe430 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -75,6 +75,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TEST(assign, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "assign"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "assign"); } diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index fc31885824b55f..edad20435b41c9 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -575,7 +575,7 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = @@ -585,6 +585,8 @@ class BatchNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + use_global_stats = is_test || use_global_stats; + // batch_norm with inplace as false will take X as grad input, which // is same as cuDNN batch_norm backward calculation, batch_norm // with inplace as true only take Y as input and X should be calculate @@ -605,13 +607,6 @@ class BatchNormGradKernel "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b8..6fc78732b1063a 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H 
* W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int 
max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef 
PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType 
mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -592,7 +817,7 @@ class BatchNormGradKernel platform::errors::InvalidArgument("It must use CUDAPlace.")); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -625,12 +850,7 @@ class BatchNormGradKernel } const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); + use_global_stats = is_test || use_global_stats; const auto &x_dims = x->dims(); @@ -668,8 +888,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +938,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +962,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// 
platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +988,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1001,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1102,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), 
+// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1191,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 20b33c4e4e05a6..0de0f5e4505795 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -41,11 +40,20 @@ class CastNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); + if (x->type() == dtype) { + // NOTE(zhiqiu): NPU cast op may result in wrong value, so + // add special case here. + VLOG(4) << "cast to same dtype:" << dtype; + out->mutable_data(place, x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + return; + } + auto iter = DTYPE_2_ACL_DTYPE.find( static_cast(dtype)); int aclDtype = iter->second; @@ -76,7 +84,7 @@ class CastNPUKernel : public framework::OpKernel { } }; } // namespace operators -} // namespace paddleaclDtype +} // namespace paddle namespace ops = paddle::operators; @@ -84,9 +92,9 @@ REGISTER_OP_NPU_KERNEL( cast, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, + ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel); -#endif diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757c..7176a0466bb831 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf6..fd61e4ea61d4ff 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ 
limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 977a208d20e783..3f210219608fb7 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_ASCEND) - op_library(gen_nccl_id_op) - op_library(c_gen_nccl_id_op) -endif() - - if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() @@ -35,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + 
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/alltoall_op.cc 
b/paddle/fluid/operators/collective/alltoall_op.cc new file mode 100644 index 00000000000000..1c57b9f9967633 --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +namespace paddle { +namespace operators { + +class AllToAllOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class AllToAllOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor send."); + AddOutput("Out", "(Tensor) the result of alltoall."); + AddAttr("ring_id", "(int 
default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +AllToAll Operator +Scatter tensors from all participators to all participators. +)DOC"); + } +}; + +template +class AllToAllOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("alltoall"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(alltoall, ops::AllToAllOp, ops::AllToAllOpMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllInplaceInferer) + +REGISTER_OP_CPU_KERNEL(alltoall, ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc new file mode 100644 index 00000000000000..1bcb47fc686cfe --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int send_numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + framework::DDim x_dims = x->dims(); + framework::DDim out_dims(x_dims); + PADDLE_ENFORCE_EQ( + x_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The first dimension size (%d) of the input tensor must be " + "divisible by the number of ranks (%d).", + x_dims[0], nranks)); + auto send_buf = x->data(); + auto recv_buf = out->mutable_data(out_dims, place); + size_t offset = 0; + send_numel /= nranks; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < nranks; ++i) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + recv_buf + offset, 
send_numel, dtype, i, comm->comm(), stream)); + offset += send_numel; + } + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.h b/paddle/fluid/operators/collective/alltoall_op.h new file mode 100644 index 00000000000000..61eec44093794c --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support alltoall for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8c..c4e779698cccaf 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 00000000000000..e7f05549d9efea --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CAllGatherOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + framework::DDim out_dims = in->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint64_t send_numel = in->numel(); + void *send_buff = reinterpret_cast(const_cast(in->data())); + void *recv_buff = reinterpret_cast(out->data()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + VLOG(3) << "begin hccl allgather, parameter is: " + << ", group is " << group << ", ring_id is " << ring_id + << ", nranks is " << nranks; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, comm->comm(), + 
reinterpret_cast(stream))); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc new file mode 100644 index 00000000000000..4c7dfc4aad7d0e --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + 
std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + 
ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 835b49e57bc092..8bdbdfac8ffd1d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -37,14 +37,19 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Max"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker); +REGISTER_OPERATOR( + c_allreduce_max, ops::CAllReduceOp, ops::CAllReduceMaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMaxInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_max, 
ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 00000000000000..4dece4a3721ff5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 00000000000000..b7fd2739d51181 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = 
atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + 
for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc new file mode 100644 index 00000000000000..b0aa51f7cfdfdc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_max, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index efc19659c83ec3..9d913b12b13767 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -37,14 +37,19 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Min"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp, - ops::CAllReduceMinOpMaker); +REGISTER_OPERATOR( + c_allreduce_min, ops::CAllReduceOp, ops::CAllReduceMinOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMinInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_min, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc new file mode 100644 index 00000000000000..48e1d2eeb58c52 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ 
-0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_min, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc new file mode 100644 index 00000000000000..2f16a89c217dac --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2f56f43d793fa9..3a74f551e7a30e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,17 +19,31 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -105,6 +119,136 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = 
platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + out->mutable_data(in->dims(), ctx.GetPlace()); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CAllReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + 
out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: @@ -170,10 +314,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. 
In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 5ab07ef026bac5..3ad078e1c8ff0f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -37,14 +37,19 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Prod"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp, - ops::CAllReduceProdOpMaker); +REGISTER_OPERATOR( + c_allreduce_prod, ops::CAllReduceOp, ops::CAllReduceProdOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceProdInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_prod, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 00000000000000..f3d14afe0a1bc7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_prod, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc new file mode 100644 index 00000000000000..92ba00428065bc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_prod, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 68061e6ae6bea0..18c317506c06e1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("c_allreduce_sum"); + bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel")); + if (use_mp) { + retv->SetType("c_identity"); + } else { + retv->SetType("c_allreduce_sum"); + } retv->SetInput("X", this->OutputGrad("Out")); retv->SetOutput("Out", this->InputGrad("X")); retv->SetAttrMap(this->Attrs()); @@ -49,6 +54,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Sum"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle @@ -58,7 +65,7 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp, ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpGradMaker, - ops::CAllReduceSumOpMaker); + ops::CAllReduceSumOpMaker, ops::AllreduceSumInplaceInferer); REGISTER_OP_CPU_KERNEL(c_allreduce_sum, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc new file mode 100644 index 00000000000000..b66e2e1968908c --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 00000000000000..f1bf9683e35593 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + 
attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc new file mode 100644 index 00000000000000..e4ec538cd23230 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc index 928fa8549ffb92..271d543eb2364d 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("root", "(int default 0) root id for broadcasting.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc new file mode 100644 index 00000000000000..a60ba86572822c --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. 
recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 00000000000000..9e39613f3fbe3a --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + 
attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 00000000000000..7817f19bacb187 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CCommInitOpAscend : public framework::OperatorBase { + public: + CCommInitOpAscend(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOpAscend can run on npu place only.")); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input con not be empty.")); +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo* hccl_id = var->GetMutable(); + + int rank_ids = Attr("rank_ids"); + int rank_id = Attr("rank"); + int rid = Attr("ring_id"); + int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + if (Attr("device_id") >= 0) { + device_id = Attr("device_id"); + } + platform::HCCLCommContext::Instance().CreateHCCLComm( + hccl_id, rank_ids, rank_id, device_id, rid); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CCommInit operator + +Initialize collective communicatoin context within this trainer +)DOC"); + AddAttr("rank_ids", + "(int) The number of ranks of distributed trainers"); + AddAttr("rank", + "(int) The rank of the trainer in distributed training."); + 
AddAttr("device_id", + "(int) The deivce_id on which to initialize the communicator." + "Now, you only have to set this attr manually for pipeline " + "training. Otherwise, make it as default.") + .SetDefault(-1); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, + ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc new file mode 100644 index 00000000000000..551fde21162582 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_concat_op.h" + +namespace paddle { +namespace operators { + +class CConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_concat must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; + if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CConcatOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_split"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CConcatOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be concated."); + AddOutput("Out", "(Tensor) the result of concat."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CConcat Operator +AllGather the tensors on different trainers and concat them along the last dimension. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 00000000000000..bfdc49c440aae7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + 
stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.h b/paddle/fluid/operators/collective/c_concat_op.h new file mode 100644 index 00000000000000..55a5799e37b6f5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CConcatOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 00000000000000..593eaf923a9784 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + 
AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 7da30f64d1ce39..470537582e9783 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -66,6 +66,9 @@ class CGenNCCLIdOp : public framework::OperatorBase { return Output("Out"); }; + std::string endpoint = Attr("endpoint"); + int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); + std::vector nccl_ids; nccl_ids.resize(1); @@ -75,8 +78,6 @@ class CGenNCCLIdOp : public framework::OperatorBase { Attr>("other_endpoints"); platform::SendBroadCastCommID(endpoint_list, &nccl_ids); } else { - std::string endpoint = Attr("endpoint"); - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids); } diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 00000000000000..646c27b90e17ea --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model 
parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 00000000000000..8ccf40e317aded --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 00000000000000..ca817fb6bac0e1 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 00000000000000..f35b4c2f707226 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc new file mode 100644 index 00000000000000..6d3af7bb5f258b --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_max, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 00000000000000..6ebb7e4c40e68e --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc new file mode 100644 index 00000000000000..791e58d8493cec --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_min, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 1bce01e13a2ad2..fa9fd079d8e48b 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -24,15 +24,28 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -110,6 +123,148 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = 
reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType 
dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + int root = ctx.Attr("root_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, root, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: @@ -179,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 00000000000000..f0b7021e7997d9 --- /dev/null +++ 
b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc new file mode 100644 index 00000000000000..e7e770e8ffdcaf --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_prod, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 00000000000000..dd4dbbd5f36457 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 00000000000000..3683c7722ba3bf --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = 
" << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + 
init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc new file mode 100644 index 00000000000000..a0ec4d2a99cd71 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_sum, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270cc..7836f11dc9b1fb 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb7..490b152bc2d302 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 00000000000000..44096a82c34d61 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + 
+} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 00000000000000..f82f050a7206fe --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + 
std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + 
attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 00000000000000..03046d571d0f05 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_split_op.h" + +namespace paddle { +namespace operators { + +class CSplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_split must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_split must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CSplitOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allgather"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CSplitOpMaker : public framework::OpProtoAndCheckerMaker 
{ + public: + void Make() { + AddInput("X", "(Tensor) tensor to be split."); + AddOutput("Out", "(Tensor) the result of split."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("use_model_parallel", + "(bool default false) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CSplit Operator +Split the tensor evenly according to its rank. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 00000000000000..92a7f5e41b1d2d --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h 
b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 00000000000000..ea0c7fc45c66b8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 700d1173e2ff68..71ab25a7b0ff8a 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -46,7 +46,7 @@ Call calculation stream synchronization. 
}; template -class CSyncCalcStreamCudaKernel : public framework::OpKernel { +class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) @@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); @@ -76,5 +86,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, ops::CSyncCalcStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, - ops::CSyncCalcStreamCudaKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 00000000000000..45613715b8260c --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + 
op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 95b9cd040fe94e..71fda2cd01c8d6 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -19,6 +19,11 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -53,13 +58,11 @@ Call communication stream synchronization. 
}; template -class CSyncCommStreamCudaKernel : public framework::OpKernel { +class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = ctx.GetPlace(); - +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = ctx.Attr("ring_id"); auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); @@ -70,6 +73,15 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); @@ -85,5 +97,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, ops::CSyncCommStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, - ops::CSyncCommStreamCudaKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 00000000000000..6c5a6db61483dc --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = 
atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int 
rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 00000000000000..0cb2dd188725f8 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. 
Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. 
hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId 
operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] " + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 00000000000000..15940a76f71105 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. +#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client 
+ return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + 
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + 
VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 
00000000000000..1ad6f791e1fc34 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f91..39a9ed0c74ef59 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of 
tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 00000000000000..69f1f4681a33d6 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 
00000000000000..384dfd1fc5f2d3 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + 
+ std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + 
tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813a..c60d560e43baed 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc 
b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 00000000000000..0ade090fcaac07 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) 
<< "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 00000000000000..cf01b1d0a6a1d1 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto 
op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a0..bf047de86fc21a 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); 
-REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f30635835976..3ca700e16e6e7b 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d3..ff929ee7dfce79 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, 
InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index ab535e341f7575..7fdb1ccfe9614f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -699,24 +699,51 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + ScalingParamType beta = 0.0f; +#else ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +#endif VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); if (input_grad) { // When beta is 0, it is unnecessary to reset input_grad. // When beta is 1, the output cannot be reset since addt strategy used. 
#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size)); - }, - workspace_size); + if (ctx.Attr("use_addto")) { + Tensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), temp_tensor_data, + cudnn_workspace_ptr, workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), + transformed_input_grad_data, &alpha, args1.idesc.desc(), + temp_tensor_data, &beta, args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), + transformed_input_grad_data, cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + } + #else for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 3ab27e1ec4f4fc..befe09c8e6beb3 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -146,28 
+146,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.fwd_algo; - } else { - auto& temp = ctx.cuda_device_context(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.fwd_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; VLOG(3) << "choose algo " << algo; return algo; } @@ -208,27 +188,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.bwd_data_algo; - } else { - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.bwd_data_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_data_algo; 
VLOG(3) << "choose algo " << algo; return algo; } @@ -269,27 +230,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.bwd_weights_algo; - } else { - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.bwd_weights_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_weights_algo; VLOG(3) << "choose algo " << algo; return algo; } diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 00000000000000..721354954c7035 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = 
dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is copying micro scope."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is a id of the current micro scope."); + AddAttr("to_main_scope", "Return current scope to main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is 
used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 00000000000000..e175b235f9c181 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + 
TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index a51fce8132418b..9b08f875bb6e6d 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -12,17 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#ifndef PADDLE_WITH_HIP -// HIP not supported yet - #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef __HIPCC__ +#define __syncwarp() __all(1) +#endif + namespace paddle { namespace operators { +#ifdef __HIPCC__ +#define THREADS_PER_BLOCK 64 +#else #define THREADS_PER_BLOCK 32 +#endif #define FULL_MASK 0xffffffff using framework::Tensor; @@ -30,14 +35,22 @@ using framework::Tensor; template __forceinline__ __device__ T warpReduceSum(T val) { for (int offset = 16; offset > 0; offset /= 2) { +#ifdef __HIPCC__ + val += __shfl_down(val, offset); +#else val += __shfl_down_sync(FULL_MASK, val, offset); +#endif } return val; } template __forceinline__ __device__ T blockReduceSum(T val) { +#ifdef __HIPCC__ + static __shared__ T shared[64]; +#else static __shared__ T shared[32]; +#endif int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; @@ -483,5 +496,3 @@ REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, ops::CorrelationCUDAKernel); REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, ops::CorrelationCUDAGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc new file mode 100644 index 00000000000000..e553b1076a8640 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. + PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +class DecodeJpegOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto mode = ctx->Attrs().Get("mode"); + std::vector out_dims; + + if (mode == "unchanged") { + out_dims = {-1, -1, -1}; + } else if (mode == "gray") { + out_dims = {1, -1, -1}; + } else if (mode == "rgb") { + out_dims = {3, -1, -1}; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU: ", mode)); + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return 
framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class DecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. +)DOC"); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + decode_jpeg, ops::DecodeJpegOp, ops::DecodeJpegOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(decode_jpeg, ops::CPUDecodeJpegKernel) diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu new file mode 100644 index 00000000000000..35975a6a549867 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +static cudaStream_t nvjpeg_stream = nullptr; +static nvjpegHandle_t nvjpeg_handle = nullptr; + +void InitNvjpegImage(nvjpegImage_t* img) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + img->channel[c] = nullptr; + img->pitch[c] = 0; + } +} + +template +class GPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Create nvJPEG handle + if (nvjpeg_handle == nullptr) { + nvjpegStatus_t create_status = + platform::dynload::nvjpegCreateSimple(&nvjpeg_handle); + + PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegCreateSimple failed: ", + create_status)); + } + + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = + platform::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegJpegStateCreate failed: ", + state_status)); + + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + auto* x = ctx.Input("X"); + auto* x_data = x->data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + nvjpeg_handle, x_data, (size_t)x->numel(), &components, &subsampling, + widths, heights); + + PADDLE_ENFORCE_EQ( + info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + auto mode = 
ctx.Attr("mode"); + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + nvjpegImage_t out_image; + InitNvjpegImage(&out_image); + + // create nvjpeg stream + if (nvjpeg_stream == nullptr) { + cudaStreamCreateWithFlags(&nvjpeg_stream, cudaStreamNonBlocking); + } + + int sz = widths[0] * heights[0]; + + auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + T* data = out->mutable_data(ctx.GetPlace()); + + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + nvjpeg_handle, nvjpeg_state, x_data, x->numel(), output_format, + &out_image, nvjpeg_stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(decode_jpeg, ops::GPUDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 2e03622e10f0f4..7e3ab6be664cb9 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the 
License. +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4b..6a34ef48a169dc 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt new file mode 100644 index 00000000000000..4fe9cf214eaa70 --- /dev/null +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -0,0 +1,54 @@ +# compile flags +set(DLNNE_FLAGS + -Wno-error=non-virtual-dtor + -Wno-error=unused-variable + -Wno-error=attributes + ${fsanitize} +) +foreach(flag ${DLNNE_FLAGS}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) +endforeach() + + +# add nne +find_path(DLNNE_INCLUDE_DIR dlnne.h + PATHS + $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include + 
NO_DEFAULT_PATH +) + +find_library(DLNNE_LIB libdlnne.so + PATHS + $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne + NO_DEFAULT_PATH +) + +find_path(CUDA_INCLUDE_DIR cuda.h + $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include +) + +find_library(CURT_LIB libcurt.so + PATHS + $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib + NO_DEFAULT_PATH +) + + +message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR}) +message("DLNNE_LIB: "${DLNNE_LIB}) +message("CUDA_INCLUDE_DIR: "${CUDA_INCLUDE_DIR}) +message("CURT_LIB: "${CURT_LIB}) + +include_directories("${DLNNE_INCLUDE_DIR}") +include_directories("${CUDA_INCLUDE_DIR}") + +op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost device_context op_registry scope) + +#message("PYBIND_FILE:${pybind_file}") +#file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(dlnne_engine);\n") +#endif() + +target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB}) + +cc_test(test_dlnne_engine_op SRCS dlnne_engine_op_test.cc DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc new file mode 100644 index 00000000000000..4654e6a9f978a2 --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" + +namespace paddle { +namespace inference { + +void CopyTensorDeviceToCpu(void* dst_ptr, void* src_ptr, int total_bytes) { + cudaDeviceSynchronize(); + cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); +} +void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) { + cudaDeviceSynchronize(); + cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); +} + +} // namespace inference + +namespace operators { + +class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDuplicable(); + AddOutput("Ys", "A list of outputs").AsDuplicable(); + AddAttr("subgraph", "the subgraph."); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different DLNNE Engines"); + AddAttr("sub_block", "the trt block"); + AddComment("Dlnne engine operator."); + } +}; + +class DlnneEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(dlnne_engine, ops::DlnneEngineOp, ops::DlnneEngineOpMaker); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h new file mode 100644 index 00000000000000..d426876c18fa5e --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOTLINT +#include // NOTLINT +#include // NOTLINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace dl { +namespace nne { +class Builder; +class Engine; +class Network; +class Parser; +class ExecutionContext; +} // namespace nne +} // namespace dl + +namespace paddle { +namespace inference { +class NneDeleter { + public: + NneDeleter() {} + + template + inline void operator()(T *ptr) { + if (ptr != nullptr) { + ptr->Destroy(); + } + } +}; + +void CopyTensorDeviceToCpu(void *dst_ptr, void *src_ptr, int total_bytes); + +void CopyTensorCpuToDevice(void *dst_ptr, void *src_ptr, int total_bytes); + +template +struct Singleton; +} // namespace inference +} // namespace paddle + +namespace paddle { + +namespace operators { + +class DlnneEngineOp : public framework::OperatorBase { + private: + std::vector input_names_; + std::unordered_set param_names_; + std::string engine_key_; + int num_inputs; + int num_outputs; + std::vector output_names; + std::vector input_names; + + dl::nne::Builder *builder; + dl::nne::Parser *parser; + dl::nne::Network *network; + dl::nne::ExecutionContext *context; + dl::nne::Engine *engine; + + unsigned int engine_input_size; + std::vector InputIndexToBindIndex_; + + public: + DlnneEngineOp(const std::string &type, + const 
framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + input_names_ = Inputs("Xs"); + engine_key_ = Attr("engine_key"); + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + + num_inputs = 0; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + input_names.push_back(x); + } + + num_outputs = Outputs("Ys").size(); + for (const auto &y : Outputs("Ys")) { + VLOG(4) << "y: " << y << std::endl; + output_names.push_back(y); + } + + // onnx path + std::stringstream filename; + std::string current_path = "."; + char *buffer; + if ((buffer = getcwd(NULL, 0)) != NULL) { + current_path = buffer; + } else { + current_path = "."; + } + filename << current_path << "/dump/" << engine_key_ << "/" << engine_key_ + << ".onnx"; + + builder = dl::nne::CreateInferBuilder(); + PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable( + "nne create builder failed")); + parser = dl::nne::CreateParser(); + PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable( + "nne create parser failed")); + + network = builder->CreateNetwork(); + + LOG(INFO) << "set output for dlnne"; + for (std::string &output_op_name : output_names) + parser->RegisterOutput(output_op_name.c_str()); + + LOG(INFO) << "parser onnx for dlnne"; + parser->Parse(filename.str().c_str(), *network); + + LOG(INFO) << "build network"; + engine = builder->BuildEngine(*network); + + // total size = input_size+output_size + engine_input_size = num_inputs + num_outputs; + for (std::string &input_name : input_names) { + int BindIndex = engine->GetBindingIndex(input_name.c_str()); + InputIndexToBindIndex_.push_back(BindIndex); + } + + for (std::string &output_name : output_names) { + int BindIndex = engine->GetBindingIndex(output_name.c_str()); + InputIndexToBindIndex_.push_back(BindIndex); + } 
+ + // context + context = engine->CreateExecutionContext(); + } + + ~DlnneEngineOp() { + network->Destroy(); + context->Destroy(); + engine->Destroy(); + parser->Destroy(); + builder->Destroy(); + } + + protected: + void RunDlnneOnCreateEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + PADDLE_ENFORCE_EQ( + input_names_.empty(), false, + platform::errors::PreconditionNotMet( + "Dlnne engine needs at least one input, but no input is found. " + "Please check if you set the input correctly.")); + + std::vector input_buffers(num_inputs); + std::vector cpu_input_buffers(num_inputs); + std::vector> input_shapes(num_inputs); + std::vector input_data_types(num_inputs); + std::vector input_bytes(num_inputs); + + int index = 0; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + // convert input and copy to Dlnne engine's buffer + auto &t = + inference::analysis::GetFromScope(scope, x); + + const int bind_index = index; + index++; + int64_t data_bytes; + int32_t dtype; + auto type = t.type(); + data_bytes = 1; + void *buffer = nullptr; + if (type == framework::proto::VarType::FP32) { + buffer = static_cast(t.data()); + data_bytes = 4; + dtype = 0; + } else if (type == framework::proto::VarType::INT64) { + buffer = static_cast(t.data()); + data_bytes = 8; + dtype = 1; + } else if (type == framework::proto::VarType::INT32) { + buffer = static_cast(t.data()); + data_bytes = 4; + dtype = 2; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The DLNNE Engine OP only support float/int32_t/int64_t input.")); + } + input_buffers[bind_index] = buffer; + + auto t_shape = framework::vectorize(t.dims()); + std::vector runtime_input_shape(t_shape.begin(), t_shape.end()); + for (auto &size : t_shape) { + data_bytes = data_bytes * size; + } + + VLOG(4) << "buffers_size:" << data_bytes; + cpu_input_buffers[bind_index] = + input_buffers[bind_index]; // malloc(data_bytes); + input_shapes[bind_index] = runtime_input_shape; + 
input_data_types[bind_index] = dtype; + input_bytes[bind_index] = data_bytes; + } + + // output shape + std::vector> out_shapes; + std::vector output_bytes; + for (int i = 0; i < num_outputs; i++) { + int index = engine->GetBindingIndex(output_names[i].c_str()); + dl::nne::Dims out_dim = engine->GetBindingDimensions(index); + std::vector shape(out_dim.nbDims); + for (int dim = 0; dim < out_dim.nbDims; dim++) { + shape[dim] = (out_dim.d[dim]); + } + + out_shapes.push_back(shape); + int64_t data_bytes; + + // float32 + data_bytes = 4; + for (auto &size : shape) { + data_bytes = data_bytes * size; + } + VLOG(4) << "data_bytes: " << data_bytes; + output_bytes.push_back(data_bytes); + } + + int bind_index = 0; + std::vector cpu_output_buffers(num_outputs); + std::vector output_buffers(num_outputs); + std::vector output_dtypes(num_outputs); + + for (const auto &y : Outputs("Ys")) { + auto *fluid_v = scope.FindVar(y); + PADDLE_ENFORCE_NOT_NULL( + fluid_v, + platform::errors::NotFound( + "Output variable %s is not found in DLNNE subgraph.", y)); + + auto *fluid_t = fluid_v->GetMutable(); + + VLOG(4) << "out_shapes[bind_index] dim:" << out_shapes[bind_index].size(); + fluid_t->Resize(framework::make_ddim(out_shapes[bind_index])); + + int32_t dtype; + output_buffers[bind_index] = fluid_t->mutable_data( + BOOST_GET_CONST(platform::CPUPlace, dev_place)); + dtype = 0; + cpu_output_buffers[bind_index] = + output_buffers[bind_index]; // malloc(data_bytes); + output_dtypes[bind_index] = dtype; + bind_index++; + } + + std::vector engine_input_ptr(engine_input_size); + + // set input_ptr + for (unsigned int i = 0; i < engine_input_size; i++) { + if (InputIndexToBindIndex_[i] < 0) continue; + + if (engine->BindingIsInput(InputIndexToBindIndex_[i])) { + // copy cpu buffer to gpu buffer + int64_t total_bytes; + total_bytes = input_bytes[i]; + VLOG(4) << "input_bytes: " << total_bytes; + + void *gpu_ptr; + cudaMalloc(&gpu_ptr, total_bytes); + engine_input_ptr[InputIndexToBindIndex_[i]] 
= gpu_ptr; + + paddle::inference::CopyTensorCpuToDevice( + gpu_ptr, reinterpret_cast(cpu_input_buffers[i]), + total_bytes); + + } else { + int64_t total_size; + total_size = output_bytes[i - input_names.size()]; + VLOG(4) << "output_bytes: " << total_size; + void *gpu_ptr; + cudaMalloc(&gpu_ptr, total_size); + engine_input_ptr[InputIndexToBindIndex_[i]] = gpu_ptr; + } + } + + clock_t startTime, endTime; + startTime = clock(); + context->Execute(1, engine_input_ptr.data()); + endTime = clock(); + double during_ms = + static_cast(endTime - startTime) / CLOCKS_PER_SEC * 1000; + LOG(INFO) << "dlNNE execute time: " << during_ms << " ms"; + + bind_index = 0; + for (unsigned int i = 0; i < engine_input_size; i++) { + if (InputIndexToBindIndex_[i] < 0) continue; + + if (i >= input_names.size()) { + void *cpu_ptr = cpu_output_buffers[i - input_names.size()]; + int64_t size; + size = output_bytes[i - input_names.size()]; + paddle::inference::CopyTensorDeviceToCpu( + cpu_ptr, engine_input_ptr[InputIndexToBindIndex_[i]], size); + // dtype: float32 + int32_t dtypes; + dtypes = 0; + + cpu_output_buffers[bind_index] = cpu_ptr; + output_dtypes[bind_index] = dtypes; + bind_index++; + } + cudaFree(engine_input_ptr[InputIndexToBindIndex_[i]]); + } + } + + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunDlnneOnCreateEngine(scope, dev_place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc new file mode 100644 index 00000000000000..caf1a80fcc737f --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -0,0 +1,237 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" + +USE_NO_KERNEL_OP(dlnne_engine); +namespace paddle { +namespace operators { + +namespace { +void CreateCUDATensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { + auto* var = scope->Var(name); + auto* tensor = var->GetMutable(); + auto dims = framework::make_ddim(shape); + tensor->Resize(dims); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + inference::tensorrt::RandomizeTensor(tensor, place, ctx); +} + +void AddTensorToBlockDesc(framework::proto::BlockDesc* block, + const std::string& name, + const std::vector& shape) { + using framework::proto::VarType; + auto* var = block->add_vars(); + framework::VarDesc desc(name); + desc.SetType(VarType::LOD_TENSOR); + desc.SetDataType(VarType::FP32); + desc.SetShape(shape); + *var = *desc.Proto(); +} + +} // namespace + +using inference::analysis::SetAttr; + +TEST(DlnneEngineOp, manual) { + framework::ProgramDesc program; + auto* block_ = 
program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + LOG(INFO) << "create fc op"; + auto* fc0 = block_desc.AppendOp(); + fc0->SetType("fc"); + fc0->SetInput("X", std::vector({"x"})); // 4 x 1 x 1 + fc0->SetInput("Y", std::vector({"y"})); // 4 x 6 + fc0->SetOutput("Out", std::vector({"z"})); // 6 x 1 x 1 + + LOG(INFO) << "create fc op"; + auto* fc1 = block_desc.AppendOp(); + fc1->SetType("fc"); + fc1->SetInput("X", std::vector({"z"})); + fc1->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc1->SetOutput("Out", std::vector({"z0"})); // 8 x 1 x 1 + + // Set inputs' variable shape in BlockDesc + // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1} + AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 1, 1})); + AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); + AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); + AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); + + // It is wired, need to copy manually. 
+ *block_->add_ops() = *fc0->Proto(); + *block_->add_ops() = *fc1->Proto(); + + ASSERT_EQ(block_->ops_size(), 2); + + LOG(INFO) << "create dlnne desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("dlnne_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x"})); + engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("a_calib_engine")); + engine_op_desc.SetAttr("predictor_id", 1); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); + engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); + int device_id = 0; + engine_op_desc.SetAttr("gpu_id", device_id); + + LOG(INFO) << "create engine op"; + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + LOG(INFO) << "engine_op " << engine_op.get(); + + framework::Scope scope; + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + // Prepare variables. + CreateCUDATensor(&scope, "x", std::vector({2, 4})); + CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "z", std::vector({2, 6})); + + CreateCUDATensor(&scope, "y0", std::vector({6, 8})); + CreateCUDATensor(&scope, "z0", std::vector({2, 8})); + + // Execute them. 
+ LOG(INFO) << "engine_op run"; + engine_op->Run(scope, place); +} + +void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + using shape_t = std::vector; + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + + auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name, + const std::string& z_name, bool x_created, + const shape_t& x_shape, const shape_t& y_shape, + const shape_t& z_shape) { + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({x_name})); + fc->SetInput("Y", std::vector({y_name})); + fc->SetOutput("Out", std::vector({z_name})); + + // Set inputs' variable shape in BlockDesc + if (!x_created) { + AddTensorToBlockDesc(block_, x_name, + std::vector({batch_size, input_dim, 1, 1})); + } + AddTensorToBlockDesc(block_, y_name, + std::vector({input_dim, output_dim})); + AddTensorToBlockDesc(block_, z_name, + std::vector({batch_size, output_dim})); + + // Prepare variables. + if (!x_created) { + CreateCUDATensor(&scope, x_name, std::vector(x_shape)); + } + CreateCUDATensor(&scope, y_name, std::vector(y_shape)); + CreateCUDATensor(&scope, z_name, std::vector(z_shape)); + + // It is wired, need to copy manually. 
+ *block_->add_ops() = *fc->Proto(); + }; + + // Test with 4 layer FC + AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim}, + {input_dim, output_dim}, {batch_size, output_dim}); + AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + + LOG(INFO) << "create dlnne desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("dlnne_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z3"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("b_calib_engine")); + engine_op_desc.SetAttr("predictor_id", 1); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); + engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); + int device_id = 0; + engine_op_desc.SetAttr("gpu_id", device_id); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + + // Execute them. + engine_op->Run(scope, place); +} + +// Test with a larger FC layer. 
+TEST(DlnneEngineOp, fc) { Execute(40, 28, 28); } + +} // namespace operators +} // namespace paddle + +USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 1b607922eda1d8..0987118ba39b6e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -205,35 +205,25 @@ struct DotGradFunction> { } } #else - const auto* data_dout = tensor_dout->data(); + auto const *x = tensor_x->data(), *y = tensor_y->data(), + *dz = tensor_dout->data(); + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_y = tensor_y->data(); - const framework::DDim& dim = tensor_x->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = data_y[i] * data_dout[s]; + auto* dx = tensor_dx->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; } } if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_x = tensor_x->data(); - const framework::DDim& dim = tensor_y->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = data_x[i] * data_dout[s]; + auto* dy = tensor_dy->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; } } #endif @@ -266,21 +256,20 @@ class DotKernel : public framework::OpKernel { out.device(dev) = (x * y).sum(Eigen::DSizes(1)); } #else - const auto* data_x = tensor_x->data(); - const auto* data_y = tensor_y->data(); - auto* data_out = tensor_out->data(); - - auto x_dims = tensor_x->dims(); - auto 
step = x_dims[x_dims.size() - 1]; - int size = static_cast(framework::product(x_dims)); - - for (int ind = -1, j = 0; j < size; ++j) { - if (j % step == 0) { - ++ind; - data_out[ind] = data_x[j] * data_y[j]; - } else { - data_out[ind] += data_x[j] * data_y[j]; - } + auto const *x = tensor_x->data(), *x_ = &x[0]; + auto const *y = tensor_y->data(), *y_ = &y[0]; + auto* z = tensor_out->data(); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_++) * (*y_++); + z[j] = ss; } #endif } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 0ca03fc32fbf67..5c444e752e7975 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" @@ -34,7 +33,9 @@ namespace operators { */ template struct CudaAddFunctor { - inline HOSTDEVICE T operator()(T args[]) const { return args[0] + args[1]; } + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] + args[1]; + } }; template diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 5b8d08a8943dde..3768748931ded2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -100,9 +100,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { - ctx.template device_context() - .Wait(); - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); + framework::TensorCopy( + *tmp_dout, ctx.GetPlace(), + ctx.template device_context(), dx); } } @@ -127,8 +127,6 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; - ctx.template device_context() - .Wait(); } // stage 2 @@ -144,9 +142,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { - ctx.template device_context() - .Wait(); - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy); + framework::TensorCopy( + *tmp_dout, ctx.GetPlace(), + ctx.template device_context(), dy); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36add2112974dc..321826ec647c99 
100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -13,6 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +#ifdef __HIPCC__ +#define ELEMENTWISE_BLOCK_SIZE 256 +#else +#define ELEMENTWISE_BLOCK_SIZE 512 +#endif + namespace paddle { namespace operators { @@ -90,8 +101,7 @@ struct ElementwiseDataWrapper { template __device__ void VectorizedKernelImpl( - ElementwiseDataWrapper data, int size, Functor func, - int tid) { + ElementwiseDataWrapper data, Functor func, int tid) { using VecType = CudaAlignedVector; VecType ins_vec[ET]; VecType out_vec; @@ -121,10 +131,9 @@ __device__ void VectorizedKernelImpl( data.store_vector(out_vec, tid); } -template -__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, - int size, Functor func, int start, - int remain) { +template +__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, + Functor func, int start, int remain) { T ins[ET]; T out; @@ -146,12 +155,11 @@ __global__ void VectorizedKernel(const T *__restrict__ in0, int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = size - VecSize * tid; remain = remain > 0 ? 
remain : 0; + auto data = ElementwiseDataWrapper(out, in0, in1); if (remain >= VecSize) { - auto data = ElementwiseDataWrapper(out, in0, in1); - VectorizedKernelImpl(data, size, func, tid); + VectorizedKernelImpl(data, func, tid); } else { - auto data = ElementwiseDataWrapper(out, in0, in1); - ScalarKernelImpl(data, size, func, tid * VecSize, remain); + ScalarKernelImpl(data, func, tid * VecSize, remain); } } @@ -162,7 +170,7 @@ __global__ void ScalarKernel(const T *__restrict__ in0, auto data = ElementwiseDataWrapper(out, in0, in1); int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = tid < size ? 1 : 0; - ScalarKernelImpl(data, size, func, tid, remain); + ScalarKernelImpl(data, func, tid, remain); } template @@ -173,7 +181,7 @@ void LaunchElementwiseCudaKernel( // calculate the max vec_size for all ins and outs auto size = ins[0]->numel(); int vec_size = GetVectorizedSize(ins, *outs); - int block_size = PADDLE_CUDA_THREAD_SIZE; + int block_size = ELEMENTWISE_BLOCK_SIZE; int grid_size = ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; const T *in0 = ins[0]->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index df6fae6c8484a0..f06dbd26873a60 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -38,7 +38,7 @@ USE_OP(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template -void Compare(f::Scope* scope, const p::DeviceContext& ctx, +void Compare(f::Scope *scope, const p::DeviceContext &ctx, std::string op_type) { // init auto x = scope->Var("X"); @@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TensorFromVector(init_y, ctx, tensor_y); tensor_y->Resize({10, 10}); - ctx.Wait(); - auto place = ctx.GetPlace(); auto out = scope->Var("Out"); auto tensor_out = out->GetMutable(); @@ -74,7 +72,6 @@ void Compare(f::Scope* 
scope, const p::DeviceContext& ctx, {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); - ctx.Wait(); std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); @@ -93,7 +90,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, } template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, +void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, std::string op_type) { // init auto dout = scope->Var("DOut"); @@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({2, 3, 5}); - ctx.Wait(); - // run f::AttributeMap attrs; auto op = f::OpRegistry::CreateOp( @@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, auto place = ctx.GetPlace(); op->Run(*scope, place); - ctx.Wait(); std::vector dx_vec; TensorToVector(*tensor_dx, ctx, &dx_vec); @@ -160,30 +154,30 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TEST(elementwise_add, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_add"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_add"); } TEST(elementwise_sub, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_sub"); } TEST(elementwise_sub, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_sub"); } TEST(elementwise_sub_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_sub_grad"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, 
"elementwise_sub_grad"); } TEST(elementwise_add_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_add_grad"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "elementwise_add_grad"); } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 809445c2862035..a6e438f8016e0c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -102,7 +102,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); + framework::TensorCopy( + *tmp_dout, ctx.GetPlace(), + ctx.template device_context(), dx); } } if (dy) { diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 25b83ed93f7296..e2bf61de63196b 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -147,3 +147,17 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand_as, ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CUDA_KERNEL( + expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); +#endif diff --git a/paddle/fluid/operators/expand_as_op.cu b/paddle/fluid/operators/expand_as_op.cu deleted file mode 100755 index dbb1fcf3ab3261..00000000000000 --- a/paddle/fluid/operators/expand_as_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/expand_as_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - expand_as, ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 70099afbd5994d..5296a144f6247d 100644 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -129,3 +129,18 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand_as_v2, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel); +REGISTER_OP_CUDA_KERNEL( + expand_as_v2_grad, + ops::ExpandAsV2GradKernel, + ops::ExpandAsV2GradKernel, + ops::ExpandAsV2GradKernel, + ops::ExpandAsV2GradKernel); +#endif diff --git a/paddle/fluid/operators/expand_as_v2_op.cu b/paddle/fluid/operators/expand_as_v2_op.cu deleted file mode 100644 index e315144472dd9f..00000000000000 --- a/paddle/fluid/operators/expand_as_v2_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/expand_as_v2_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 83e205367a7af6..e7da08ff277117 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -273,3 +273,21 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); +#endif diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu deleted file mode 100644 index f2f8e2f7414f38..00000000000000 --- a/paddle/fluid/operators/expand_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/expand_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - expand, ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); -REGISTER_OP_CUDA_KERNEL( - expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index f4ae1785b024f5..bb3a6512d2c8ba 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -58,9 +58,11 @@ class ExpandNPUKernel : public framework::OpKernel { expand_times.size(), static_cast(in_dims.size()))); auto* out0 = context.Output("Out"); framework::DDim out_dims(in_dims); + for (size_t i = 0; i < expand_times.size(); ++i) { out_dims[i] *= expand_times[i]; } + out0->Resize(out_dims); out0->mutable_data(context.device_context().GetPlace()); auto runner = @@ -77,6 +79,7 @@ class ExpandNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( expand, ops::ExpandNPUKernel, + ops::ExpandNPUKernel, ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 95f7865a8a3a4e..880eb341f2093b 100644 --- 
a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -69,6 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(expand, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 05ab0f6c8dc8fc..618c1560c5eac7 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -278,3 +278,21 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandV2GradKernel, ops::ExpandV2GradKernel, ops::ExpandV2GradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand_v2, ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel); +REGISTER_OP_CUDA_KERNEL( + expand_v2_grad, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel); +#endif diff --git a/paddle/fluid/operators/expand_v2_op.cu b/paddle/fluid/operators/expand_v2_op.cu deleted file mode 100644 index e096dbc27f0c2a..00000000000000 --- a/paddle/fluid/operators/expand_v2_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/expand_v2_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - expand_v2, ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_v2_grad, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index caa29309901932..f35d8b6bbf89f1 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -154,6 +154,7 @@ REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 4608f167548a38..17c7321122b174 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -105,7 +105,8 @@ class FillConstantKernel : public framework::OpKernel { int actual_place = place_type; if (actual_place == -1) { - bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); + bool cpu_place = (force_cpu || ctx.GetPlace() == platform::CPUPlace() || + data_type == framework::proto::VarType::BF16); if (cpu_place) { actual_place = 0; } else if (platform::is_gpu_place(ctx.GetPlace())) { @@ -116,6 +117,9 @@ class FillConstantKernel : public framework::OpKernel { } if (actual_place == 0) { + VLOG(4) << "[CPU] FillConstantKernel" + << ((data_type == framework::proto::VarType::BF16) ? 
"" + : ""); tensor->mutable_data(platform::CPUPlace(), data_type); math::SetConstant functor; functor(reinterpret_cast(dev_ctx), diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 9d5499e00c82f6..4ea4c11c478357 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { Tensor tensor_tmp(data_type); tensor_tmp.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{value}, ctx.device_context(), &tensor_tmp); + FillNpuTensorWithConstant(&tensor_tmp, value); out_var->mutable_data(shape, place); auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 1b2f1db1b07cdd..efcb0cbe2e2a8d 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -120,23 +120,9 @@ template class FlattenContiguousRangeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto &start_axis = context.Attr("start_axis"); - auto &stop_axis = context.Attr("stop_axis"); - auto *in = context.Input("X"); - auto x_dims = in->dims(); - int in_dims_size = x_dims.size(); - int real_start_axis = start_axis, real_stop_axis = stop_axis; - if (start_axis < 0) { - real_start_axis = start_axis + in_dims_size; - } - if (stop_axis < 0) { - real_stop_axis = stop_axis + in_dims_size; - } auto *out = context.Output("Out"); - - auto out_dims = framework::make_ddim( - GetOutputShape(real_start_axis, real_stop_axis, x_dims)); + auto out_dims = out->dims(); out->mutable_data(context.GetPlace(), in->type()); framework::TensorCopy( @@ -144,27 +130,6 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { context.template device_context(), out); out->Resize(out_dims); } - static std::vector GetOutputShape(const 
int start_axis, - const int stop_axis, - const framework::DDim &in_dims) { - int64_t outer = 1; - std::vector out_shape; - int in_dims_size = in_dims.size(); - out_shape.reserve(in_dims_size - stop_axis + start_axis); - - for (int i = 0; i < start_axis; ++i) { - out_shape.push_back(in_dims[i]); - } - for (int i = start_axis; i <= stop_axis; i++) { - outer *= in_dims[i]; - } - out_shape.push_back(outer); - for (int i = stop_axis + 1; i < in_dims_size; i++) { - out_shape.push_back(in_dims[i]); - } - - return out_shape; - } }; template diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 287827ced5115e..104298e037319c 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -32,8 +32,7 @@ if (WITH_GPU OR WITH_ROCM) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_batch_norm_act);\n") endif() # conv_fusion_op needs cudnn 7 above - # HIP not support cudnnConvolutionBiasActivationForward - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index c9ba7a61e0907f..f5ee7f55991845 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -18,14 +18,18 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/math/padding.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#else #include "paddle/fluid/platform/cudnn_helper.h" +#endif DECLARE_int64(cudnn_exhaustive_search_times); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7100 +#if PADDLE_WITH_HIP || CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -162,7 +166,78 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { if (input->dims().size() == 5) { layout = DataLayout::kNCDHW; } +#ifdef PADDLE_WITH_HIP + miopenConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(padding_common, strides, dilations); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenSetConvolutionGroupCount(cudnn_conv_desc, + groups)); + // Now only support NCHW + std::vector bias_dim = { + 1, static_cast(transformed_output.dims()[1]), 1, 1}; + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize(transformed_output.dims())); + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize(filter->dims())); + miopenTensorDescriptor_t cudnn_bias_desc = + bias_desc.descriptor(layout, bias_dim); + miopenActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + miopenConvFwdAlgorithm_t algo; + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto x_dims = framework::vectorize(transformed_input.dims()); + auto f_dims = framework::vectorize(filter->dims()); + + size_t workspace_size = 0; + PADDLE_ENFORCE_CUDA_SUCCESS( + 
platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, &workspace_size)); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; + VLOG(3) << "cuDNN forward algo " << algo; + + { + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc, + output_data, cudnn_workspace, workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForwardBias( + handle, &alpha, cudnn_bias_desc, bias_data, &beta, + cudnn_output_desc, output_data)); + if (activation != "identity") { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + handle, cudnn_act_desc, &alpha, cudnn_output_desc, output_data, + &beta, cudnn_output_desc, output_data)); + } + if (residual) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, cudnn_output_desc, output_data, + &alpha, cudnn_output_desc, residual_data, &beta, cudnn_output_desc, + output_data)); + } + } +#else // PADDLE_WITH_HIP cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -327,6 +402,7 @@ class 
CUDNNConvFusionOpKernel : public framework::OpKernel { }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } +#endif std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { auto outs = ctx.MultiOutput("Outputs"); @@ -358,8 +434,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; +#if CUDNN_VERSION >= 7100 REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); #endif +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel); +#endif diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 8a487234ad94ac..1ee8889995f4d6 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -50,6 +50,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel { auto *x = ctx.Input("X"); auto *dout = ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); // step1: Unsqueeze index framework::Tensor tmp_tensor(index->type()); @@ -66,7 +67,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel { .stream(); // step2: ZerosLike x in device - Tensor zeroslike_xout(x->type()); + Tensor zeroslike_xout(dx->type()); zeroslike_xout.Resize(x->dims()); auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); @@ -74,7 +75,6 @@ class GatherGradOpNPUKernel : public framework::OpKernel { zeroslike_xout.numel() * sizeof(T), stream); // step3: scatter(x_grad) - dx->mutable_data(ctx.GetPlace()); auto runner_scatter = NpuOpRunner( "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); runner_scatter.Run(stream); diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index de067e45585d91..31e19d8f600c39 100644 --- 
a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -152,18 +152,18 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TEST(gather, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "gather"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "gather"); } TEST(gather, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "gather"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "gather"); } TEST(gather_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "gather_grad"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "gather_grad"); } diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 7a0c93eb1b2eaa..453ae20656f1d6 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -11,6 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include #include "paddle/fluid/framework/generator.h" diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f11812ce3bb219..830dcd59839015 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -157,12 +157,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(gelu, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(gelu_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index c1859bce02c904..7d75e385e8f3b7 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -39,10 +39,9 @@ class IncrementalNPUKernel : public framework::OpKernel { out_tensor->mutable_data(context.GetPlace()); Tensor step_tensor(x_tensor->type()); - std::vector step_vec; - step_vec.push_back(static_cast(step)); - framework::TensorFromVector(step_vec, context.device_context(), - &step_tensor); + + step_tensor.mutable_data({1}, context.GetPlace()); + FillNpuTensorWithConstant(&step_tensor, static_cast(step)); auto runner = NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index b466ae275dd1c1..bde349b0a33b9d 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -71,12 +71,12 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TEST(increment, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext 
ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "increment"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "increment"); } TEST(increment, NPU_fp64) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "increment"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "increment"); } diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 6c488c387f8150..445d129d07c14b 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -88,8 +88,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimention = %d .", out_size_dim.size())); - PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 1")); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 1, + platform::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index cfbe1778c76646..a4353420c84a9a 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -14,6 +14,9 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -73,9 +76,12 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { if (scale.size() > 0) { float scale_w = -1; scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + 
platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); if (scale_w > 0.) { // round down out_w = (data_layout == DataLayout::kNCHW @@ -96,8 +102,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimention = %d .", out_size_dim.size())); - PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 1")); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 1, + platform::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } @@ -170,9 +179,17 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { scale_h = scale[0]; scale_w = scale[1]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); if (scale_h > 0. && scale_w > 0.) 
{ // round down out_h = (data_layout == DataLayout::kNCHW @@ -278,9 +295,23 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { scale_h = scale[1]; scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ // round down out_d = (data_layout == DataLayout::kNCHW @@ -359,13 +390,41 @@ class InterpolateV2Op : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + framework::LibraryType library = framework::LibraryType::kPlain; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + auto interp_method = ctx.Attr("interp_method"); + // TODO(danqing): support other interp_method + if (this->CanMKLDNNBeUsed(ctx, data_type) && + (interp_method == "nearest" || interp_method == "bilinear")) { + layout = framework::DataLayout::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + } +#endif + + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { +#ifdef PADDLE_WITH_MKLDNN + if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && + (tensor.layout() != framework::DataLayout::kMKLDNN)) { + auto attrs = Attrs(); + auto ar = paddle::framework::AttrReader(attrs); + const std::string data_format = ar.Get("data_layout"); + auto dl = framework::StringToDataLayout(data_format); + // Some models may have intentionally set "AnyLayout" for pool + // op. 
Treat this as NCHW (default data_format value) + if (dl != framework::DataLayout::kAnyLayout) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), dl); + } + } +#endif if (var_name == "SizeTensor" || var_name == "Scale") { return expected_kernel_type; } @@ -436,6 +495,9 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " "can be \'1\' for src_idx = scale*dst_index .") .SetDefault(1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index e5002e72d0edd7..6745592c5c1a8b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -982,15 +982,21 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } else { if (scale.size() > 0) { scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } } if (scale_w > 0.) 
{ @@ -1081,18 +1087,36 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } else { if (scale.size() > 1) { scale_w = scale[1]; scale_h = scale[0]; + PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } } if (scale_w > 0. && scale_h > 0.) 
{ @@ -1216,10 +1240,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } else { if (scale.size() > 1) { scale_d = scale[0]; @@ -1227,9 +1266,23 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } } if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ @@ -1334,16 +1387,22 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } else { if (scale.size() > 0) { scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } } if (scale_w > 0.) { @@ -1433,19 +1492,36 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } else { if (scale.size() > 1) { scale_w = scale[1]; scale_h = scale[0]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in 
Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } } if (scale_w > 0. && scale_h > 0.) { @@ -1581,9 +1657,23 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, scale_w = scale_data[0]; } PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } else { if (scale.size() > 1) { scale_d = scale[0]; @@ -1591,9 +1681,23 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + 
PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } } if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 95549319cd2096..c0c228ef22af3e 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -80,8 +80,7 @@ class LayerNormNPUKernel : public framework::OpKernel { default_scale.mutable_data(framework::make_ddim(axes), place); Tensor value(x->type()); value.mutable_data({1}, place); - TensorFromVector(std::vector{static_cast(1.0)}, - ctx.device_context(), &value); + FillNpuTensorWithConstant(&value, static_cast(1.0)); auto runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); @@ -95,8 +94,7 @@ class LayerNormNPUKernel : public framework::OpKernel { default_bias.mutable_data(framework::make_ddim(axes), place); Tensor value(x->type()); value.mutable_data({1}, place); - TensorFromVector(std::vector{static_cast(0)}, ctx.device_context(), - &value); + FillNpuTensorWithConstant(&value, static_cast(0)); auto runner = NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); runner.Run(stream); @@ -251,8 +249,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { default_scale.mutable_data(framework::make_ddim(axes), place); Tensor value(x->type()); value.mutable_data({1}, place); - TensorFromVector(std::vector{static_cast(1.0)}, - ctx.device_context(), &value); + FillNpuTensorWithConstant(&value, static_cast(1.0)); auto runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc new file mode 100644 index 00000000000000..4b9b96c23b0b71 --- /dev/null +++ 
b/paddle/fluid/operators/load_combine_op_npu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/load_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc new file mode 100644 index 00000000000000..1f532803458310 --- /dev/null +++ b/paddle/fluid/operators/load_op_npu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/load_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + load, ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index fab2d7f7aa0542..9574b325ef77fd 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -28,6 +28,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *ids_t = ctx.Input("Ids"); // int tensor auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, @@ -59,18 +60,16 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - // step2: ZerosLike x in device - Tensor zeroslike_w(table_grad_t->type()); - zeroslike_w.Resize(table_grad_t->dims()); - auto p = zeroslike_w.mutable_data(ctx.GetPlace()); - - platform::NPUMemsetAsync(static_cast(p), 0, - zeroslike_w.numel() * sizeof(T), stream); + auto runner_zeros = + NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); + runner_zeros.Run(stream); - table_grad_t->mutable_data(ctx.GetPlace()); + // NOTE(zhiqiu): It seems in cann 20.1, the first input and output + // can be different tensor, but in cann 20.2+, it does inplace operation. + // Thus, the first input and output should be same tensor. 
auto runner_scatter = - NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t}, - {*table_grad_t}, {}); + NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, + {*table_grad_t}, {{"use_locking", true}}); runner_scatter.Run(stream); } }; @@ -82,9 +81,11 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( lookup_table_v2, ops::LookupTableV2NPUKernel, + ops::LookupTableV2NPUKernel, ops::LookupTableV2NPUKernel); REGISTER_OP_NPU_KERNEL( lookup_table_v2_grad, ops::LookupTableV2GradNPUKernel, + ops::LookupTableV2GradNPUKernel, ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc deleted file mode 100644 index f37915834bd756..00000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/string/printf.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; -namespace m = paddle::operators::math; - -USE_OP(lookup_table_v2); -USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto ids = scope->Var("Ids"); - auto out = scope->Var("Out"); - auto w = scope->Var("W"); - - auto ids_t = ids->GetMutable(); - auto out_t = out->GetMutable(); - auto w_t = w->GetMutable(); - int bsz = 10; - int dim = 32; - int seqlen = 8; - int vocab_size = 100; - TensorFromVector(std::vector(bsz * seqlen, 3), ctx, ids_t); - std::vector val(vocab_size * dim, 10.); - TensorFromVector(val, ctx, w_t); - ids_t->Resize({bsz, seqlen}); - w_t->Resize({vocab_size, dim}); - out_t->Resize({bsz, seqlen, dim}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp("lookup_table_v2", - {{"W", {"W"}}, {"Ids", {"Ids"}}}, - {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - std::vector out_v; - TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - EXPECT_EQ(out_t->numel(), bsz * seqlen * dim); - T res = std::accumulate(out_v.begin(), out_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto w = scope->Var("W"); - auto ids = scope->Var("Ids"); - auto out = scope->Var("DOut"); - auto dw = scope->Var("DW"); - - auto w_t = w->GetMutable(); - auto ids_t = ids->GetMutable(); - 
auto out_t = out->GetMutable(); - auto dw_t = dw->GetMutable(); - - int bsz = 2; - int dim = 2; - int seqlen = 2; - int vocab_size = 4; - - std::vector val_int(bsz * seqlen, 3); - std::vector val(vocab_size * dim, 0.); - std::vector val_out(bsz * seqlen * dim, 1.); - - TensorFromVector(val_int, ctx, ids_t); - TensorFromVector(val, ctx, w_t); - TensorFromVector(val, ctx, dw_t); - TensorFromVector(val_out, ctx, out_t); - - w_t->Resize({vocab_size, dim}); - ids_t->Resize({bsz, seqlen}); - out_t->Resize({bsz, seqlen, dim}); - dw_t->Resize({vocab_size, dim}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - w_t->mutable_data(place); - dw_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp( - "lookup_table_v2_grad", - {{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}}, - {{"W@GRAD", {"DW"}}}, attrs); - op->Run(*scope, place); - ctx.Wait(); - std::vector w_v; - TensorToVector(*dw_t, ctx, &w_v); - ctx.Wait(); - EXPECT_EQ(dw_t->numel(), vocab_size * dim); - T res = std::accumulate(w_v.begin(), w_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim), eps); -} - -TEST(lookup_table_v2, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); -} - -TEST(lookup_table_v2_grad, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); -} diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 64b533de098cad..05d42f02c1003a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_MKLML #include #endif + #include #include #include @@ -28,6 +29,19 @@ namespace paddle { namespace operators { namespace math { +namespace detail { + +template +static void axpy(int n, const T alpha, const T *x, const int incx, T *y, + const int incy) { + // Y = Y + alpha * X + while (n-- > 0) { + *y += 
alpha * *x; + y = y + incy; + x = x + incx; + } +} +} // namespace detail template struct CBlas; @@ -43,6 +57,11 @@ struct CBlas { template <> struct CBlas { + template + static void AXPY(ARGS... args) { + detail::axpy(args...); + } + template static void VCOPY(ARGS... args) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 68179a68574a01..0bdc7b69434221 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -49,6 +50,7 @@ template struct SetConstant; #ifdef PADDLE_WITH_XPU template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 2b93cd926081ec..f94c1bf696cdad 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" @@ -33,6 +34,7 @@ using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index 0b615cefac4eed..b49b5036ac42e2 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -25,14 +25,12 @@ namespace operators { using Tensor = framework::Tensor; template -__global__ void SegmentMeanCustomKernel( - const Index* segment_ids, const T* input, T* output, T* summed_ids, - const Index input_length_size, const Index inner_dim_size, - const Index output_length_size, const Index total_stripe_count) { +__global__ void SegmentSumIdsKernel(const Index* segment_ids, T* summed_ids, + const Index input_length_size, + const Index total_stripe_count) { CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { - const Index segment_offset = stripe_index % inner_dim_size; - const Index dim_index_base = - stripe_index / inner_dim_size * Index(DimTileSize); + const Index segment_offset = stripe_index; + const Index dim_index_base = stripe_index * Index(DimTileSize); const Index actual_height = min(Index(DimTileSize), input_length_size - dim_index_base); @@ -41,19 +39,20 @@ __global__ void SegmentMeanCustomKernel( if (dim_index_base > 0) { last_segment_id = segment_ids[dim_index_base - 1]; } - if (segment_offset == 0) { - T sum = T(0); - for (Index j = 0; j < actual_height; j++) { - Index current_segment_id = 
segment_ids[dim_index_base + j]; - // Note(ZHUI): following check may cause - // cudaErrorLaunchOutOfResources. - // PADDLE_ENFORCE(current_segment_id >= last_segment_id, - // "the segment ids should be sorted, but got " - // "segment_ids[%d]:%d > segment_ids[%d]:%d.", - // dim_index_base + j - 1, dim_index_base + j, - // last_segment_id, current_segment_id); - - if (j > 0 && current_segment_id > last_segment_id) { + T sum = T(0); + for (Index j = 0; j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + PADDLE_ENFORCE(current_segment_id >= last_segment_id, + "the segment ids should be sorted, but got " + "segment_ids[%d]:%d > segment_ids[%d]:%d.", + dim_index_base + j - 1, dim_index_base + j, + last_segment_id, current_segment_id); + if (current_segment_id > last_segment_id) { + for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(summed_ids + interval_id) = 0; + } + if (j > 0) { if (last_segment_id == first_segment_id) { platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); } else { @@ -61,33 +60,60 @@ __global__ void SegmentMeanCustomKernel( } sum = T(0); } - sum += T(1); - last_segment_id = current_segment_id; } - platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + sum += T(1); + last_segment_id = current_segment_id; + } + platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + } +} + +template +__global__ void SegmentMeanKernel(const Index* segment_ids, const T* input, + T* output, T* summed_ids, + const Index input_length_size, + const Index inner_dim_size, + const Index output_length_size, + const Index total_stripe_count) { + CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index dim_index_base = + stripe_index / inner_dim_size * Index(DimTileSize); + const Index actual_height = + min(Index(DimTileSize), input_length_size - dim_index_base); + + Index first_segment_id = 
segment_ids[dim_index_base]; + Index last_segment_id = -1; + if (dim_index_base > 0) { + last_segment_id = segment_ids[dim_index_base - 1]; } - // ensure last_segment_id is the largest - last_segment_id = output_length_size; - __syncthreads(); T sum = T(0); for (Index j = 0; j < actual_height; j++) { Index current_segment_id = segment_ids[dim_index_base + j]; if (current_segment_id > last_segment_id) { - const Index output_index = - last_segment_id * inner_dim_size + segment_offset; - if (last_segment_id == first_segment_id) { - platform::CudaAtomicAdd(output + output_index, - sum / *(summed_ids + last_segment_id)); - } else { - *(output + output_index) = sum / *(summed_ids + last_segment_id); + // reset the interval value which do not have corresponding ids. + for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(output + interval_id * inner_dim_size + segment_offset) = T(0); + } + + if (j > 0) { + Index output_index = + last_segment_id * inner_dim_size + segment_offset; + + if (last_segment_id == first_segment_id) { + platform::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); + } else { + *(output + output_index) = sum / *(summed_ids + last_segment_id); + } + sum = T(0); } - sum = T(0); } sum += input[(dim_index_base + j) * inner_dim_size + segment_offset]; last_segment_id = current_segment_id; } - const Index output_index = - last_segment_id * inner_dim_size + segment_offset; + Index output_index = last_segment_id * inner_dim_size + segment_offset; platform::CudaAtomicAdd(output + output_index, sum / *(summed_ids + last_segment_id)); } @@ -122,7 +148,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, // reset the interval value which do not have corresponding ids. 
for (Index interval_id = last_segment_id + 1; interval_id < current_segment_id; ++interval_id) { - *(output + interval_id * inner_dim_size + segment_offset) = 0; + *(output + interval_id * inner_dim_size + segment_offset) = T(0); } // don't update result when j=0 if (j > 0) { @@ -272,11 +298,25 @@ class SegmentPoolFunctor { framework::Tensor* output, framework::Tensor* summed_ids = nullptr, const std::string pooltype = "SUM") { + if (pooltype == "MEAN") { + // Sum the segment id num first + T DimTileSize = 8; + auto input_length_size = segment_ids.numel(); + auto total_stripe_count = + (input_length_size + DimTileSize - 1) / DimTileSize; + auto config = platform::GetGpuLaunchConfig1D(ctx, total_stripe_count); + SegmentSumIdsKernel< + T, IndexT, IndexT(8)><<>>( + segment_ids.data(), summed_ids->data(), input_length_size, + total_stripe_count); + } + auto h = ArrangeHelper(input.numel(), segment_ids.dims()[0], output->dims()[0]); auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count); if (pooltype == "MEAN") { - SegmentMeanCustomKernel< + SegmentMeanKernel< T, IndexT, IndexT(8)><<>>( segment_ids.data(), input.data(), output->data(), diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f7b16453e0133b..b9a1854a66118e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; // This is a separated namespace for manipulate SelectedRows typed // data. Like merge duplicated rows, adding two SelectedRows etc. @@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor; // add or mul. 
namespace scatter { -template -typename std::enable_if< - std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_to(const DeviceContext& ctx, BlasT* blas, - size_t data_len, const T* in, T* out) { - blas->AXPY(data_len, 1., in, out); +template +typename std::enable_if::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + blas->AXPY(data_len, T(1.f), in, out); } -template -typename std::enable_if< - !std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_to(const DeviceContext& ctx, BlasT* blas, - size_t data_len, const T* in, T* out) { +template +typename std::enable_if::value>::type elementwise_add_to( + BlasT* blas, size_t data_len, const T* in, + T* out) { for (size_t i = 0; i < data_len; i++) { out[i] += in[i]; } @@ -412,7 +410,7 @@ struct MergeAdd { out.set_rows(merge_rows); math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), static_cast(0.f)); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -429,9 +427,9 @@ struct MergeAdd { for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); } } } @@ -524,9 +522,9 @@ struct MergeAverage { for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); } } size_t input_width_cast = static_cast(input_width); @@ -547,6 +545,8 @@ template struct MergeAdd; template 
struct MergeAdd; +template struct MergeAdd; template struct MergeAverage; template struct MergeAverage; diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index d78e3385efb29c..a73f76f53be052 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -87,7 +87,11 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else int threads = 1024; +#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax<<>>( input.numel(), input_data, indices_data, input_height, input_width, @@ -117,7 +121,11 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else int threads = 1024; +#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad<<>>( input.numel(), input_data, indices_data, input_height, input_width, diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index 676086bd080633..d6e982039fa290 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -68,10 +68,8 @@ class MeanGradNPUKernel : public framework::OpKernel { Tensor mean_tensor(grad->type()); mean_tensor.Resize({1}); mean_tensor.mutable_data(context.GetPlace()); - std::vector mean_vec; - mean_vec.push_back(1.0 / static_cast(IG->numel())); - framework::TensorFromVector(mean_vec, context.device_context(), - &mean_tensor); + FillNpuTensorWithConstant( + &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); // means mul ones Tensor mean_ma(grad->type()); diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index d10d5bf12e6b4a..ecd2d48dcbd102 100644 
--- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -105,16 +105,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker { "is the same as input X."); AddAttr("dst_place_type", "Determine the dst place of tensor copy. " - "By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other " - "place type is Unimplemented and will cause ERROR." + "By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or " + "NPUPlace <-> CPUPlace. " + "Other place type is Unimplemented and will cause ERROR." "0: dst is on CPUPlace. " "1: dst is on CUDAPlace. " "2: dst is on CUDAPinnedPlace. " - "3: dst is on XPUPlace. "); + "3: dst is on XPUPlace. " + "4: dst is on NPUPlace. "); AddComment(R"DOC( Memcpy Operator. - By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace, - and used as an internal op by Recompute-Offload. + By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or + NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload. You would have to update it if you want other more capacities. 
Out = X, when type in [LoDTensor] @@ -139,10 +141,18 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, int, ops::MemcpyKernel, int64_t, ops::MemcpyKernel, bool, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); #endif + +#ifdef PADDLE_WITH_ASCEND_CL +REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, + ops::MemcpyKernel, int, ops::MemcpyKernel, + int64_t, ops::MemcpyKernel, bool, + ops::MemcpyKernel, plat::float16, + ops::MemcpyKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h old mode 100755 new mode 100644 index f81ca05f4380a4..63a41cc7237310 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -51,7 +51,17 @@ class MemcpyFunctor { } else if (dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); - } else { + } +#ifdef PADDLE_WITH_ASCEND_CL + else if (dst_place_type_ == 0) { // NOLINT + framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_, + &out_tensor); + } else if (dst_place_type_ == 4) { + framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, + &out_tensor); + } +#endif + else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 33f71b4adc066f..54600e26bb57f3 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -157,3 +157,17 @@ REGISTER_OP_CPU_KERNEL( ops::MeshgridGradKernel, ops::MeshgridGradKernel, ops::MeshgridGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + 
meshgrid, ops::MeshgridKernel, + ops::MeshgridKernel, + ops::MeshgridKernel, + ops::MeshgridKernel, + ops::MeshgridKernel); +REGISTER_OP_CUDA_KERNEL( + meshgrid_grad, + ops::MeshgridGradKernel, + ops::MeshgridGradKernel, + ops::MeshgridGradKernel, + ops::MeshgridGradKernel); +#endif diff --git a/paddle/fluid/operators/meshgrid_op.cu b/paddle/fluid/operators/meshgrid_op.cu deleted file mode 100644 index dc813a07f8c8c1..00000000000000 --- a/paddle/fluid/operators/meshgrid_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/meshgrid_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - meshgrid, ops::MeshgridKernel, - ops::MeshgridKernel, - ops::MeshgridKernel, - ops::MeshgridKernel, - ops::MeshgridKernel); -REGISTER_OP_CUDA_KERNEL( - meshgrid_grad, - ops::MeshgridGradKernel, - ops::MeshgridGradKernel, - ops::MeshgridGradKernel, - ops::MeshgridGradKernel); diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 64a1903c2da4ff..9d80286f4c4efa 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -33,7 +33,7 @@ class InterpolateMKLDNNHandler : public platform::MKLDNNHandlerT { public: InterpolateMKLDNNHandler(const dnnl::algorithm algo, - const paddle::platform::MKLDNNDeviceContext& dev_ctx, + const platform::MKLDNNDeviceContext& dev_ctx, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, Tensor* z, const std::string& uniq_name) @@ -94,19 +94,32 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { out_dims = out_size_data; } } else { - float scale; + std::vector scale; + scale.reserve(3); auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); - scale = scale_data[0]; + scale.resize(3, scale_data[0]); + std::copy(scale_data.begin(), scale_data.end(), scale.begin()); } else { - scale = ctx.Attr("scale"); + std::string op_type = ctx.Type(); + + if (op_type.find("v2") == std::string::npos) { // v1 + scale.push_back(ctx.Attr("scale")); + scale.push_back(scale[0]); + scale.push_back(scale[0]); + } else { // v2 + std::vector scale_attr = ctx.Attr>("scale"); + scale.resize(3, scale_attr[0]); + std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + } } - if (scale > 0) { + if (scale[0] > 0.0f && scale[1] > 0.0f && scale[2] > 0.0f) { + int j = 0; std::vector in_dhw_vec = 
framework::vectorize(in_dhw_dims); std::transform( in_dhw_vec.begin(), in_dhw_vec.end(), out_dims.begin(), - [&](int64_t i) -> int { return static_cast(i * scale); }); + [&](int64_t i) -> int { return static_cast(i * scale[j++]); }); } } @@ -172,3 +185,8 @@ REGISTER_OP_KERNEL(nearest_interp, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); + +REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, + ops::InterpolateMKLDNNKernel); +REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, + ops::InterpolateMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index aafff5248a0244..d6cd76b697f518 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -50,7 +50,7 @@ class CacheTester { platform::CPUPlace place; onednn_dev_ctx_ = dynamic_cast(pool.Get(place)); - onednn_dev_ctx_->ResetBlobMap(); + onednn_dev_ctx_->ResetBlobMap(nullptr); } bool Analyze(unsigned short int num_entries) { diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 71bfacb9283850..3c85da3c52c6c9 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -11,7 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/modified_huber_loss_op.h" diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 9fcc629233891b..843736833f8156 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -32,6 +32,12 @@ namespace cub = hipcub; #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + namespace paddle { namespace operators { @@ -58,12 +64,10 @@ using DataLayout = framework::DataLayout; // axis=(n,h,w))) template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, const int N, const int C, - const int sample_size, const double epsilon, - T *dx) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDX( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const T *scale, const T *ddscale, const int N, const int C, + const int sample_size, const double epsilon, T *dx) { const int outer_size = C; const int inner_size = N * sample_size; @@ -160,12 +164,10 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * // np.mean(ddx * (x - mean), axis=(n,h,w))) template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, const int N, const int C, - const int sample_size, - const double epsilon, T *ddy) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDDY( + const T *x, const T *mean, const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, const T *scale, const int N, const int C, + const int sample_size, const double epsilon, T *ddy) { const int 
outer_size = C; const int inner_size = N * sample_size; @@ -238,11 +240,10 @@ __global__ void DoubleGradComputeDDY(const T *x, const T *mean, // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * // ddx template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const int N, const int C, - const int sample_size, - const double epsilon, T *dscale) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScale( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const int N, const int C, const int sample_size, const double epsilon, + T *dscale) { const int outer_size = C; const int inner_size = N * sample_size; @@ -302,7 +303,7 @@ __global__ void DoubleGradComputeDScale(const T *x, const T *mean, // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var template -__global__ void DoubleGradComputeDScaleWithGlobal( +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScaleWithGlobal( const T *ddx, const T *variance, const T *dy, const double epsilon, const int N, const int C, const int sample_size, T *dscale) { int outer_size = C; @@ -422,8 +423,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); - +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); @@ -532,6 +536,5 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } } } - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index aa0c4d2dfd274e..276bfa7b3281b9 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) { return iter->second; } -aclrtStream GetCurrentNPUStream() { - int device_id = platform::GetCurrentNPUDeviceId(); +aclrtStream GetCurrentNPUStream(int device_id) { + if (device_id == -1) { + device_id = platform::GetCurrentNPUDeviceId(); + } platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast( pool.Get(platform::NPUPlace(device_id))); @@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) { VLOG(4) << "after aclopCompileAndExecute: " << ret; PADDLE_ENFORCE_NPU_SUCCESS(ret); } + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index e178f7fc6e96d8..cfc933c7a76fa7 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -86,6 +86,48 @@ class NpuOpRunner { aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype); +aclrtStream GetCurrentNPUStream(int device_id = -1); + +template +void FillNpuTensorWithConstant(Tensor *tensor, T val) { + // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small + // like 1e-8. 
+ constexpr float MIN_PRECISION_FOR_POWER = 1e-3; + PADDLE_ENFORCE_EQ( + tensor->IsInitialized(), true, + platform::errors::InvalidArgument("The tensor should be initialized.")); + PADDLE_ENFORCE_EQ( + platform::is_npu_place(tensor->place()), true, + platform::errors::InvalidArgument("The tensor should be on NPUPlace.")); + // do async for better performance + if ((typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) && + static_cast(val) > MIN_PRECISION_FOR_POWER) { + Tensor tmp(tensor->type()); + tmp.Resize(tensor->dims()); + tmp.mutable_data(tensor->place()); + auto stream = GetCurrentNPUStream( + BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device); + platform::NPUMemsetAsync(tmp.data(), 0, tmp.numel() * sizeof(T), + stream); + auto runner = NpuOpRunner("Power", {tmp}, {*tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(val)}}); + runner.Run(stream); + } else { + T *array = new T[tensor->numel()]; + for (unsigned int i = 0; i < tensor->numel(); ++i) { + array[i] = static_cast(val); + } + std::vector vec(tensor->numel(), static_cast(val)); + // do sync copy + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), + tensor->data(), platform::CPUPlace(), array, + tensor->numel() * sizeof(T), nullptr); + delete[] array; + } +} + } // namespace operators } // namespace paddle #endif diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 621920731fb603..a7886cdd670d4d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -151,6 +151,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "as beta2, this has a higher priority than attr(beta2), the " "shape of this tensor MUST BE [1].") .AsDispensable(); + AddInput("EpsilonTensor", + "(Tensor, optional) If provided, Adam will use this " + "as epsilon, this has a higher priority than attr(epsilon), the " + "shape of 
this tensor MUST BE [1].") + .AsDispensable(); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); @@ -232,4 +237,13 @@ REGISTER_OP_VERSION(adam) paddle::framework::compatible::OpVersionDesc().NewAttr( "multi_precision", "(bool) Whether to use multi-precision during weight updating.", - false)); + false)) + .AddCheckpoint( + R"ROC( + Upgrade adam, add 1 dispensable input [EpsilonTensor]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "EpsilonTensor", + "If provided, Adam will use this as epsilon, " + "this has a higher priority than attr(epsilon). " + "For better performance in npu kernel. ")); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 54aea67f4ea1b3..3d6f0f99a52dfb 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -154,7 +154,7 @@ class AdamOpCUDAKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - MPDType epsilon = static_cast(ctx.Attr("epsilon")); + auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); auto* mom1 = ctx.Input("Moment1"); @@ -188,6 +188,15 @@ class AdamOpCUDAKernel : public framework::OpKernel { beta2_tensor->numel())); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); 
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 6356911f0676a8..9667db8055b90c 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -406,7 +406,7 @@ class AdamOpKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - T epsilon = static_cast(ctx.Attr("epsilon")); + auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); auto* mom1 = ctx.Input("Moment1"); @@ -440,6 +440,15 @@ class AdamOpKernel : public framework::OpKernel { beta2_tensor->numel())); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + T epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 134544c2f65bc3..343a6704388623 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/optimizers/adam_op.h" @@ -61,27 +62,71 @@ class AdamNPUKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()); mom1_out->mutable_data(ctx.GetPlace()); mom2_out->mutable_data(ctx.GetPlace()); - beta1_pow_out->mutable_data(ctx.GetPlace()); - beta2_pow_out->mutable_data(ctx.GetPlace()); - T beta1 = static_cast(ctx.Attr("beta1")); + // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place. + if (beta1_pow->place() == platform::CPUPlace()) { + T beta1 = *beta1_pow->data(); + // `mutable_data` operation needs to be done after getting data + beta1_pow_out->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(beta1_pow_out, beta1); + } else { + beta1_pow_out->mutable_data(ctx.GetPlace()); + } + if (beta2_pow->place() == platform::CPUPlace()) { + T beta2 = *beta2_pow->data(); + beta2_pow_out->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(beta2_pow_out, beta2); + } else { + beta2_pow_out->mutable_data(ctx.GetPlace()); + } + + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(framework::proto::VarType::FP32); + Tensor beta2_tmp(framework::proto::VarType::FP32); + Tensor epsilon_tmp(framework::proto::VarType::FP32); + if (ctx.HasInput("Beta1Tensor")) { - auto* beta1_tensor = ctx.Input("Beta1Tensor"); + beta1_tensor = ctx.Input("Beta1Tensor"); PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta1Tensor) size must be 1, but get %d", beta1_tensor->numel())); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } else { + T beta1 = static_cast(ctx.Attr("beta1")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta1_tmp, beta1); + beta1_tensor = &beta1_tmp; } - T beta2 = static_cast(ctx.Attr("beta2")); + if 
(ctx.HasInput("Beta2Tensor")) { - auto* beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta2Tensor) size must be 1, but get %d", beta2_tensor->numel())); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } else { + T beta2 = static_cast(ctx.Attr("beta2")); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta2_tmp, beta2); + beta2_tensor = &beta2_tmp; + } + + if (ctx.HasInput("EpsilonTensor")) { + epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + } else { + T epsilon = static_cast(ctx.Attr("epsilon")); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&epsilon_tmp, epsilon); + epsilon_tensor = &epsilon_tmp; } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); @@ -97,21 +142,6 @@ class AdamNPUKernel : public framework::OpKernel { "beta2 pow output size should be 1, but received " "value is:%d.", beta2_pow_out->numel())); - - // reshape - Tensor beta1_tensor(framework::proto::VarType::FP32); - beta1_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{beta1}, ctx.device_context(), - &beta1_tensor); - Tensor beta2_tensor(framework::proto::VarType::FP32); - beta2_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{beta2}, ctx.device_context(), - &beta2_tensor); - - Tensor epsilon_tensor(framework::proto::VarType::FP32); - epsilon_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{epsilon}, ctx.device_context(), - &epsilon_tensor); auto stream = ctx.template device_context() .stream(); @@ -119,7 +149,7 @@ class
AdamNPUKernel : public framework::OpKernel { NpuOpRunner("ApplyAdamD", { *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, - beta1_tensor, beta2_tensor, epsilon_tensor, *grad, + *beta1_tensor, *beta2_tensor, *epsilon_tensor, *grad, }, { *param_out, *mom1_out, *mom2_out, @@ -130,22 +160,25 @@ class AdamNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): ApplyAdamD updates params inplace, so // if param and param_out is not same, we need to do copy. if (param_out->data() != param->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*param, ctx.GetPlace(), param_out); + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); } if (mom1_out->data() != mom1->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*mom1, ctx.GetPlace(), mom1_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); } if (mom2_out->data() != mom2->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*mom2, ctx.GetPlace(), mom2_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); } auto runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {}); + NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); runner_m1.Run(stream); auto runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, beta2_tensor}, {*beta2_pow_out}, {}); + NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); runner_m2.Run(stream); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 3baba424e8f43d..09f117374499b0 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -35,8 +35,6 @@ class AdamOpXPUKernel : public framework::OpKernel { framework::ToTypeName(param_var->Type()))); using paddle::framework::LoDTensor; - T epsilon = 
static_cast(ctx.Attr("epsilon")); - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", "Param", "Adam"); // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); @@ -85,6 +83,11 @@ class AdamOpXPUKernel : public framework::OpKernel { auto* beta2_tensor = ctx.Input("Beta2Tensor"); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + T epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } if (grad_var->IsType()) { auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Adam"); diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc index b7aaff5d457918..a8d19148ef520c 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_npu.cc @@ -44,8 +44,9 @@ class SGDNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so // if param and param_out is not same, we need to do copy. 
if (param_out->data() != param_var->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*param_var, ctx.GetPlace(), param_out); + framework::TensorCopy( + *param_var, ctx.GetPlace(), + ctx.template device_context(), param_out); } } }; diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 0d5c23bed6016e..f91496eeab1420 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -60,33 +60,57 @@ void RunPyObject(py::object *py_object, outs->size(), result_tuple.size())); } for (size_t i = 0; i < result_tuple.size(); i++) { - if (Py_None != result_tuple[i].ptr()) { + if ((*outs)[i] != nullptr) { + if (Py_None != result_tuple[i].ptr()) { + try { + auto result_var = + result_tuple[i].cast>(); + *(*outs)[i] = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The output of `PyLayer.backward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth input tensor of forward needs gradient and the " + "corresponding gradient cannot be None.", + i)); + } + } else { + if (Py_None != result_tuple[i].ptr()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth input tensor of forward do not need gradient and the " + "corresponding gradient should be `None`.", + i)); + } + } + } + } else { + if (1 != outs->size()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The number of outputs of `PyLayer.backward` should be %d, but " + "received 1.", + outs->size())); + } + if ((*outs)[0] != nullptr) { + if (Py_None != py_result.ptr()) { try { auto result_var = - result_tuple[i].cast>(); - *(*outs)[i] = result_var->Var(); + py_result.cast>(); + *((*outs)[0]) = result_var->Var(); } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(platform::errors::InvalidArgument( "The output of `PyLayer.backward` should be `Tensor`.")); } } else { - 
PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` can not be `None`.")); - } - } - } else { - if (Py_None != py_result.ptr()) { - try { - auto result_var = - py_result.cast>(); - *((*outs)[0]) = result_var->Var(); - } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` should be `Tensor`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor of forward needs gradient, so the output of " + "`PyLayer.backward` can not be `None`.")); } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` can not be `None`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor of forward do not need gradient, so the output of " + "`PyLayer.backward` should be `None`.")); } } } @@ -133,9 +157,12 @@ class PyLayerOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &op_ = ctx.GetOp(); - auto pylayer_op = dynamic_cast(&op_); - if (pylayer_op) { - auto py_layer_context = pylayer_op->GetPyLayerContext(); + auto const_pylayer_op = dynamic_cast(&op_); + if (const_pylayer_op) { + auto pylayer_op = const_cast(const_pylayer_op); + + // Release contex after executing the compute + auto py_layer_context = pylayer_op->ReleasePyLayerContext(); py::object bk_ctx(py::handle(py_layer_context->GetMutableCtx()), true); auto &input_vars = ctx.MultiInputVar("X"); auto output_vars = ctx.MultiOutputVar("Out"); diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index 133435aa84d71e..d80faab90b2236 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -34,6 +34,10 @@ class PyLayerContext { PyLayerContext() = delete; PyObject* GetMutableCtx() { return context_; } + ~PyLayerContext() { + py::gil_scoped_acquire guard; + Py_XDECREF(context_); + } private: PyObject* context_; @@ -58,8 +62,11 @@ class 
PyLayerOp : public framework::OperatorWithKernel { void SetPyLayerContext(const std::shared_ptr& py_context) { py_context_ = py_context; } - const std::shared_ptr& GetPyLayerContext() const { - return py_context_; + std::shared_ptr ReleasePyLayerContext() { + auto temp = py_context_; + py_context_.reset(); + VLOG(3) << "`py_context_` in the PyLayerOp is released."; + return temp; } private: diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc index 228372e1e93e03..a9a2effd2eb9db 100644 --- a/paddle/fluid/operators/range_op_npu.cc +++ b/paddle/fluid/operators/range_op_npu.cc @@ -39,11 +39,23 @@ class RangeNPUKernel : public framework::OpKernel { auto* out = context.Output("Out"); framework::Tensor n; - framework::TensorCopySync(*start_t, platform::CPUPlace(), &n); + framework::TensorCopy( + *start_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); T start = n.data()[0]; - framework::TensorCopySync(*end_t, platform::CPUPlace(), &n); + framework::TensorCopy( + *end_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); T end = n.data()[0]; - framework::TensorCopySync(*step_t, platform::CPUPlace(), &n); + framework::TensorCopy( + *step_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); T step = n.data()[0]; int64_t size = 0; diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 562a560b2f1548..f2f395314c0cc8 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -87,6 +87,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TEST(range, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "range"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, 
"range"); } diff --git a/paddle/fluid/operators/read_file_op.cc b/paddle/fluid/operators/read_file_op.cc new file mode 100644 index 00000000000000..6da92ed7df7d8e --- /dev/null +++ b/paddle/fluid/operators/read_file_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUReadFileKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto filename = ctx.Attr("filename"); + + std::ifstream input(filename.c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + std::streamsize file_size = input.tellg(); + + input.seekg(0, std::ios::beg); + + auto* out = ctx.Output("Out"); + std::vector out_shape = {file_size}; + out->Resize(framework::make_ddim(out_shape)); + + uint8_t* data = out->mutable_data(ctx.GetPlace()); + + input.read(reinterpret_cast(data), file_size); + } +}; + +class ReadFileOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + 
platform::errors::InvalidArgument( "Output(Out) of ReadFileOp is null.")); + + auto out_dims = std::vector(1, -1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::UINT8, + platform::CPUPlace()); + } +}; + +class ReadFileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "The output tensor of ReadFile op"); + AddComment(R"DOC( +This operator reads a file. +)DOC"); + AddAttr("filename", "Path of the file to be read.") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + read_file, ops::ReadFileOp, ops::ReadFileOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(read_file, ops::CPUReadFileKernel) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index b29493404f4536..f5d55791d86c68 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -53,9 +53,25 @@ BufferedReader::BufferedReader( stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(place_)) { + int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device; + compute_stream_ = + ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::NpuEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); + } +#endif is_same_place_ = false; cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); +
npu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -196,7 +212,59 @@ void BufferedReader::ReadAsync(size_t i) { #endif } } -#endif // @} End Group GPU Place +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(place_)) { + TensorVec &npu = npu_buffer_[i]; + if (npu.empty()) { + npu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + npu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on NPU and CPU devices are not matched. " + "The number on NPU is %d, on CPU is %d", + npu.size(), cpu.size())); + } + + std::vector npu_ptrs; + npu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + npu[i].Resize(cpu[i].dims()); + npu[i].set_layout(cpu[i].layout()); + npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetNPUDeviceId( + BOOST_GET_CONST(platform::NPUPlace, place_).device); + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtRecordEvent(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtStreamWaitEvent(stream_.get(), events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto npu_ptr = npu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); + if ((platform::is_npu_place(cpu_place))) { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, + BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr, + size, stream_.get()); + } else { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, + BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr, + size, stream_.get()); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get())); + } + npu[i].set_lod(cpu[i].lod()); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get())); + } +#endif return i; })); } @@ -228,9 +296,13 @@ void BufferedReader::ReadNextImpl(std::vector 
*out) { return; } - *out = std::move((platform::is_gpu_place(place_) && !is_same_place_) - ? cuda_buffer_[i] - : cpu_buffer_[i]); + if (platform::is_gpu_place(place_) && !is_same_place_) { + *out = std::move(cuda_buffer_[i]); + } else if (platform::is_npu_place(place_) && !is_same_place_) { + *out = std::move(npu_buffer_[i]); + } else { + *out = std::move(cpu_buffer_[i]); + } // Do not push current position into ReadAsync. Push the previous position // Since all computation in fluid are async, change the data of diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index fbc46aceb81305..9f7b0e753281eb 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -25,7 +25,10 @@ #include "paddle/fluid/platform/cuda_resource_pool.h" #include "paddle/fluid/platform/gpu_info.h" #endif - +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/npu_resource_pool.h" +#endif namespace paddle { namespace operators { namespace reader { @@ -67,12 +70,19 @@ class BufferedReader : public framework::DecoratedReader { bool is_same_place_; std::vector cpu_buffer_; std::vector cuda_buffer_; + std::vector npu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_ASCEND_CL + aclrtStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 98a68ca69cafd0..1aa93c80387e65 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -68,7 +68,7 @@ TEST(BlockingQueue, SenderBlockingTest) { ++send_count; } }); - 
std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); q.Close(); sender.join(); EXPECT_EQ(send_count, queue_cap); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc index a9eed0d7eb0427..dfba933940bd02 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc @@ -25,6 +25,32 @@ class ReduceMeanMKLDNNKernel : public ReduceMKLDNNKernel { } }; +template +class ReduceMeanGradMKLDNNKernel : public ReduceGradMKLDNNKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* input_x = ctx.Input("X"); + auto input_dims = framework::vectorize(input_x->dims()); + auto reduce_dims = ctx.Attr>("dim"); + + int number_of_elements = 1; + if (!ctx.Attr("reduce_all")) { + for (size_t i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = (reduce_dims[i] >= 0) + ? 
reduce_dims[i] + : input_dims.size() + reduce_dims[i]; + number_of_elements *= input_dims[reduce_dims[i]]; + } + } else { + number_of_elements = input_x->numel(); + } + + this->RunKernel(ctx, dnnl::algorithm::binary_add, + dnnl::algorithm::reduction_mean, 0.0f, + 1.0L / number_of_elements); + } +}; + } // namespace operators } // namespace paddle @@ -32,3 +58,7 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(reduce_mean, MKLDNN, paddle::platform::CPUPlace, ops::ReduceMeanMKLDNNKernel, ops::ReduceMeanMKLDNNKernel); + +REGISTER_OP_KERNEL(reduce_mean_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReduceMeanGradMKLDNNKernel, + ops::ReduceMeanGradMKLDNNKernel); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 7e09aaa126effe..40cd3ba974f04c 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -21,6 +21,27 @@ using paddle::framework::LoDTensor; using paddle::framework::Tensor; using platform::to_void_cast; +inline std::vector CalculateReducedDims(const Tensor* input, + const Tensor* output, + std::vector& reduce_dims, + bool reduce_all, + bool keep_dim) { + if (keep_dim) return framework::vectorize(output->dims()); + + if (reduce_all) + return std::vector(framework::vectorize(input->dims()).size(), 1); + + std::vector output_dims(framework::vectorize(input->dims())); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = (reduce_dims[i] >= 0) + ? 
reduce_dims[i] + : input->dims().size() + reduce_dims[i]; + output_dims[reduce_dims[i]] = 1; + } + + return output_dims; +} + template class ReduceMKLDNNKernel : public framework::OpKernel { public: @@ -37,9 +58,8 @@ class ReduceMKLDNNKernel : public framework::OpKernel { bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - std::vector output_dims = - CalculateOutputDims(input, output, reduce_dims, reduce_all, keep_dim); - + auto output_dims = + CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); auto input_dims = framework::vectorize(input->dims()); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -96,28 +116,98 @@ class ReduceMKLDNNKernel : public framework::OpKernel { paddle::framework::vectorize(output->dims())))); } } +}; + +template +class ReduceGradMKLDNNKernel : public framework::OpKernel { + public: + void RunKernel(const framework::ExecutionContext& ctx, + dnnl::algorithm binary_type, dnnl::algorithm reduction_type, + float scale_x, float scale_y) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + bool keep_dim = ctx.Attr("keep_dim"); + bool reduce_all = ctx.Attr("reduce_all"); + auto dims = ctx.Attr>("dim"); + auto* input_dy = ctx.Input(framework::GradVarName("Out")); + auto* output_dx = ctx.Output(framework::GradVarName("X")); + + mkldnn::memory::format_tag x_format_tag; + auto input_dims = + CalculateReducedDims(output_dx, input_dy, dims, reduce_all, keep_dim); + + if (input_dims != framework::vectorize(output_dx->dims())) { + const std::string key_pd = + platform::CreateKey( + dev_ctx, framework::vectorize(output_dx->dims()), + ctx.InputName("X"), + (std::to_string(static_cast(reduction_type)))) + + "@fwd_pd"; + std::shared_ptr fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pd)); + + PADDLE_ENFORCE_NOT_NULL( + fwd_pd, platform::errors::Unavailable( + "Forward primitive descriptor is not available in %s 
op, " + "cannot deduce memory format tag", + ctx.Type())); + + x_format_tag = platform::GetMKLDNNFormat(fwd_pd->src_desc()); + + PADDLE_ENFORCE_NE(x_format_tag, mkldnn::memory::format_tag::undef, + platform::errors::InvalidArgument( + "Cannot deduce format tag for %s op", ctx.Type())); + } else { // fwd descriptor not available because reorder was used instead + // of reduction + x_format_tag = getPlainFormatTag(output_dx); + } + + output_dx->mutable_data(ctx.GetPlace()); + output_dx->set_format(x_format_tag); + output_dx->set_layout(input_dy->layout()); + + platform::BroadcastDataMKLDNNHandler handler( + binary_type, dev_ctx, onednn_engine, ctx.GetPlace(), output_dx, + input_dy, scale_x, scale_y, + ctx.InputName(framework::GradVarName("Out")), input_dims); + + const auto src_dx_memory = handler.AcquireSrcMemory(output_dx); + const auto src_dy_memory = handler.AcquireSecondSrcMemory(input_dy); + const auto binary_prim = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dx_memory}, + {DNNL_ARG_SRC_1, *src_dy_memory}, + {DNNL_ARG_DST, *src_dx_memory}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + binary_prim->execute(astream, args); + astream.wait(); + } - private: - std::vector CalculateOutputDims(const Tensor* input, - const Tensor* output, - std::vector& reduce_dims, - bool reduce_all, - bool keep_dim) const { - if (keep_dim) return framework::vectorize(output->dims()); - - if (reduce_all) - return std::vector(framework::vectorize(input->dims()).size(), - 1); - - std::vector output_dims(framework::vectorize(input->dims())); - for (size_t i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = (reduce_dims[i] >= 0) - ? 
reduce_dims[i] - : input->dims().size() + reduce_dims[i]; - output_dims[reduce_dims[i]] = 1; + protected: + mkldnn::memory::format_tag getPlainFormatTag(const Tensor* tensor) const { + auto tensor_dims_size = tensor->dims().size(); + PADDLE_ENFORCE_EQ( + tensor_dims_size <= 5 && tensor_dims_size >= 1, true, + platform::errors::InvalidArgument( + "Dims for reduction_grad oneDNN op must be in range <1, 5>")); + + switch (tensor_dims_size) { + case 1: + return mkldnn::memory::format_tag::a; + case 2: + return mkldnn::memory::format_tag::ab; + case 3: + return mkldnn::memory::format_tag::abc; + case 4: + return mkldnn::memory::format_tag::abcd; } - return output_dims; + return mkldnn::memory::format_tag::abcde; } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc index 4676589e68910a..3f92d39ede1ae8 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc @@ -25,6 +25,15 @@ class ReduceSumMKLDNNKernel : public ReduceMKLDNNKernel { } }; +template +class ReduceSumGradMKLDNNKernel : public ReduceGradMKLDNNKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx, dnnl::algorithm::binary_add, + dnnl::algorithm::reduction_sum, 0.0f, 1.0f); + } +}; + } // namespace operators } // namespace paddle @@ -32,3 +41,7 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(reduce_sum, MKLDNN, paddle::platform::CPUPlace, ops::ReduceSumMKLDNNKernel, ops::ReduceSumMKLDNNKernel); + +REGISTER_OP_KERNEL(reduce_sum_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReduceSumGradMKLDNNKernel, + ops::ReduceSumGradMKLDNNKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d408ff3988f030..1eeeb5e1f8aa19 100644 --- 
a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -78,6 +78,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(reduce_any, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 280464ea852793..390c4d9709a60f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -560,14 +560,28 @@ class ReduceGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { int in_dtype = ctx.Attr("in_dtype"); - if (in_dtype >= 0) { - return framework::OpKernelType( - static_cast(in_dtype), - ctx.GetPlace()); + auto input_data_type = + (in_dtype >= 0) ? 
static_cast(in_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + auto CanMKLDNNReduceGradBeUsed = [&]() { + auto dx_dims = ctx.Input("X")->dims(); + + if (dx_dims.size() > 5) return false; // max 5D tensor is supported + + return true; + }; + if (this->CanMKLDNNBeUsed(ctx, input_data_type) && + CanMKLDNNReduceGradBeUsed()) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc new file mode 100644 index 00000000000000..fb82d18e62f3bf --- /dev/null +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -0,0 +1,314 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +using TensorList = std::vector; + +template +void reset_parameter_vector(const std::vector& raw_params_vec, + const int& num_layers, const bool& is_bidirec, + std::vector>* params_vec) { + // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers + // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to + // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers + const int& direction_num = is_bidirec ? 2 : 1; + const int& layer_weight_size = 4 * direction_num; + const int& all_weight_size = num_layers * layer_weight_size; + const int& bias_start_idx = all_weight_size / 2; + for (int i = 0; i < num_layers; i++) { + params_vec->at(i).resize(layer_weight_size); + for (int j = 0; j < layer_weight_size; j++) { + int k = j % 4; + const int& section = j / 4; + int tensor_idx = i * 2 * direction_num + section * 2 + k % 2; + if (k >= 2) { + tensor_idx += bias_start_idx; + } + using remove_cv_t = typename std::remove_cv::type; + params_vec->at(i)[j] = + raw_params_vec[tensor_idx]->template data(); + } + } +} + +template +class RnnXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto state = ctx.MultiOutput("State"); + auto* output = ctx.Output("Out"); + auto* reserve_data = ctx.Output("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + 
+ bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + PADDLE_ENFORCE_EQ(is_bidirec, false, + platform::errors::InvalidArgument( + "XPU only support unidirectional LSTM now")); + + PADDLE_ENFORCE_EQ( + num_layers, 1, + platform::errors::InvalidArgument( + "XPU only support 1 layer LSTM now, current layer num is %s", + num_layers)); + + auto init_h = pre_state[0]; + auto init_c = pre_state[1]; + auto last_h = state[0]; + auto last_c = state[1]; + + // check shape + int seq_len = input->dims()[0]; + int batch_size = input->dims()[1]; + int input_dim = input->dims()[2]; + + PADDLE_ENFORCE_EQ( + init_h->dims()[0], num_layers, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init " + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, init_h->dims()[0])); + + PADDLE_ENFORCE_EQ( + init_c->dims()[0], num_layers, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, init_c->dims()[0])); + + std::vector> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector(weight_list, num_layers, is_bidirec, + ¶meter_lists); + + // init the output and allocate the memory + output->mutable_data(ctx.GetPlace()); + last_h->mutable_data(ctx.GetPlace()); + last_c->mutable_data(ctx.GetPlace()); + reserve_data->Resize({seq_len * batch_size * hidden_size * 5}); + reserve_data->mutable_data(ctx.GetPlace()); + + // get ptr from tensor + auto x = input->data(); + auto h_0 = init_h->data(); + auto c_0 = init_c->data(); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto b_x = 
parameter_lists[0][2]; + auto b_h = parameter_lists[0][3]; + auto y = output->data(); + auto last_h_ptr = last_h->data(); + auto last_c_ptr = last_c->data(); + auto i_f_g_o = reserve_data->data(); + auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + + std::vector seq_len_tensor(batch_size, seq_len); + if (has_seq_length) { + seq_len_tensor = operators::GetDataFromTensor(sequence_length); + } + + // run kernel + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_train( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, + reinterpret_cast(y), reinterpret_cast(last_h_ptr), + reinterpret_cast(last_c_ptr), batch_size, input_dim, hidden_size, + seq_len, seq_len_tensor, nullptr, nullptr, nullptr, nullptr, + reinterpret_cast(i_f_g_o), reinterpret_cast(c)); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External("RnnXPU(lstm) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +template +class RnnXPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // get the tensor pointer for the input + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto* output = ctx.Input("Out"); + auto* reserve_data = ctx.Input("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + PADDLE_ENFORCE_EQ(is_bidirec, false, + 
platform::errors::InvalidArgument( + "XPU only support unidirectional LSTM now")); + + PADDLE_ENFORCE_EQ( + num_layers, 1, + platform::errors::InvalidArgument( + "XPU only support 1 layer LSTM now, current layer num is %s", + num_layers)); + + auto init_h = pre_state[0]; + auto init_c = pre_state[1]; + + auto output_grad = ctx.Input(framework::GradVarName("Out")); + auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + auto last_h_grad = state_grad[0]; + auto last_c_grad = state_grad[1]; + + // get the tensor pointer for the output + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); + auto pre_state_grad = + ctx.MultiOutput(framework::GradVarName("PreState")); + Tensor* init_h_grad = nullptr; + Tensor* init_c_grad = nullptr; + if (pre_state_grad.size() > 0) { // has gradient + init_h_grad = pre_state_grad[0]; + init_c_grad = pre_state_grad[1]; + } + + // check shape + int seq_len = input->dims()[0]; + int batch_size = input->dims()[1]; + int input_dim = input->dims()[2]; + + PADDLE_ENFORCE_EQ( + init_h->dims()[0], num_layers, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init " + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, init_h->dims()[0])); + + PADDLE_ENFORCE_EQ( + init_c->dims()[0], num_layers, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, init_c->dims()[0])); + + std::vector> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector(weight_list, num_layers, is_bidirec, + ¶meter_lists); + + for (unsigned int i = 0; i < weight_grad_list.size(); ++i) { + weight_grad_list[i]->mutable_data(ctx.GetPlace()); + } + std::vector> parameter_lists_grad; + parameter_lists_grad.resize(num_layers); + 
reset_parameter_vector(weight_grad_list, num_layers, is_bidirec, + ¶meter_lists_grad); + + // allocate the memory and initization the input_grad + input_grad->mutable_data(input->dims(), ctx.GetPlace()); + if (init_h_grad) { + init_h_grad->mutable_data(init_h->dims(), ctx.GetPlace()); + } + if (init_c_grad) { + init_c_grad->mutable_data(init_c->dims(), ctx.GetPlace()); + } + + // get ptr from tensor + auto x = input->data(); + auto h_0 = init_h->data(); + auto c_0 = init_c->data(); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto y = output->data(); + auto y_grad = output_grad->data(); + auto last_h_grad_ptr = last_h_grad->data(); + auto last_c_grad_ptr = last_c_grad->data(); + auto x_grad = input_grad->data(); + auto h_0_grad = init_h_grad ? init_h_grad->data() : nullptr; + auto c_0_grad = init_c_grad ? init_c_grad->data() : nullptr; + auto w_x_grad = parameter_lists_grad[0][0]; + auto w_h_grad = parameter_lists_grad[0][1]; + auto b_x_grad = parameter_lists_grad[0][2]; + auto b_h_grad = parameter_lists_grad[0][3]; + auto i_f_g_o = reserve_data->data(); + auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + + std::vector seq_len_tensor(batch_size, seq_len); + if (has_seq_length) { + seq_len_tensor = operators::GetDataFromTensor(sequence_length); + } + + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_grad( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)y, (const T*)y_grad, + (const T*)last_h_grad_ptr, (const T*)last_c_grad_ptr, + reinterpret_cast(x_grad), reinterpret_cast(h_0_grad), + reinterpret_cast(c_0_grad), w_x_grad, w_h_grad, b_x_grad, b_h_grad, + batch_size, input_dim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, i_f_g_o, c); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("RnnXPUGrad(lstm) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace 
operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + rnn, ops::RnnXPUKernel); +REGISTER_OP_XPU_KERNEL( + rnn_grad, ops::RnnXPUGradKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc new file mode 100644 index 00000000000000..1fb136a5110dbd --- /dev/null +++ b/paddle/fluid/operators/save_combine_op_npu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc new file mode 100644 index 00000000000000..90db1a0bb85d60 --- /dev/null +++ b/paddle/fluid/operators/save_op_npu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index ee7210a7784d72..cbfd11834ae477 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -34,6 +34,8 @@ class ScaleNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); float _power = 1.0; + VLOG(4) << "scale:" << scale << ", bias:" << bias + << " ,bias_after_scale:" << bias_after_scale; if (bias_after_scale) { out->mutable_data(ctx.GetPlace()); auto runner = diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index 144e7ceae20c16..2d23e81717abb8 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -50,10 +50,15 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE( index_dims[index_dims_size - 1], ref_dims_size, platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); + "The last dimension of Input(Index)'s shape should be no greater " + "than the rank of Input(X), but received the last dimension of " + "Input(Index)'s shape is %d, the rank of Input(X) is %d.", + index_dims[index_dims_size - 1], ref_dims_size)); 
PADDLE_ENFORCE_GE(index_dims_size, 2UL, platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); + "The rank of Input(Index) should be greater than 1, " + "but received the rank of Input(Index) is %d.", + index_dims_size)); // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] std::vector r_updates_dims; @@ -66,12 +71,21 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument("Updates has wrong shape")); + platform::errors::InvalidArgument( + "Updates has wrong shape. The shape of Updates and Input(Updates) " + "should be same, but received the shape of Updates is %d, " + "the shape of Input(Updates) is %d.", + r_updates_dims.size(), updates_dims_size)); for (int64_t i = 0; i < updates_dims_size; ++i) { PADDLE_ENFORCE_EQ( r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument("Updates has wrong shape")); + platform::errors::InvalidArgument( + "Updates has wrong shape. 
The dimensions of Updates and " + "Input(Updates) should match, but received Updates's" + "%d-th dimension is %d, Input(Updates)'s %d-th " + "dimension is %d.", + i, r_updates_dims[i], i, updates_dims[i])); } ctx->SetOutputDim("Out", ref_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3fc40d41c30817..f0faa0c5798339 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -41,15 +41,24 @@ class ScatterOp : public framework::OperatorWithKernel { auto ref_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument("Update Ids should be 1-D.")); + platform::errors::InvalidArgument( + "The size of Input(Ids)'s shape should be equal to 1, but " + "received the rank of Input(Ids) is %d.", + ctx->GetInputDim("Ids").size())); PADDLE_ENFORCE_EQ( ref_dims.size(), updates_dims.size(), platform::errors::InvalidArgument( - "Rerence and Updates should have the same shape size.")); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], - ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Updates and Ids should have same batch-size.")); + "Input(X) and Input(Updates) should have the same shape size, " + "but received the size of Input(x)'s shape is %d, the size of " + "Input(Updates)'s shape is %d.", + ref_dims.size(), updates_dims.size())); + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], + platform::errors::InvalidArgument( + "Input(Updates) and Input(Ids) should have same batch-size, but" + " received Input(Updates)'s batch-size is %d, Input(Ids)'s " + "batch-size is %d.", + ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); ctx->SetOutputDim("Out", ref_dims); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 105d61015fcb9d..96a132ac6abc21 100644 --- 
a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -146,22 +146,75 @@ Assignment to a Tensor in static mode. )DOC"); } }; + +template +class SetValueGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + if (this->HasInput("ValueTensor")) { + op->SetType("slice"); + op->SetInput("Input", this->OutputGrad("Out")); + if (this->HasInput("StartsTensorList")) { + op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); + } + + // convert std::vector to std::vector + std::vector axes_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("axes"))); + std::vector starts_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("starts"))); + std::vector ends_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("ends"))); + std::vector decrease_axes_int64 = + static_cast>(BOOST_GET_CONST( + std::vector, this->GetAttr("decrease_axes"))); + + std::vector axes(axes_int64.begin(), axes_int64.end()); + std::vector starts(starts_int64.begin(), starts_int64.end()); + std::vector ends(ends_int64.begin(), ends_int64.end()); + std::vector decrease_axes(decrease_axes_int64.begin(), + decrease_axes_int64.end()); + + op->SetAttr("axes", axes); + op->SetAttr("starts", starts); + op->SetAttr("ends", ends); + op->SetAttr("decrease_axis", decrease_axes); + op->SetAttr("infer_flags", std::vector({})); + + op->SetOutput("Out", this->InputGrad("ValueTensor")); + } else { + op->SetType("assign"); + op->SetInput("X", this->OutputGrad("Out")); + op->SetOutput("Out", this->InputGrad("Input")); + } + } +}; + +DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = 
paddle::platform; -REGISTER_OPERATOR( - set_value, ops::SetValue, ops::SetValueMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); +REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, + ops::SetValueGradMaker, + ops::SetValueGradMaker, + ops::SetValueOpInplaceInferer); REGISTER_OP_CPU_KERNEL( set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); + ops::SetValueKernel, + ops::SetValueKernel, + ops::SetValueKernel, + ops::SetValueKernel); REGISTER_OP_VERSION(set_value) .AddCheckpoint( diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 9c30c4e07fa774..22f6fa9e3e6f20 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -259,7 +259,20 @@ class SliceKernel : public framework::OpKernel { auto out_t = framework::EigenTensor::From( *out, new_out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + framework::To32BitIndex(out_t).device(place) = + framework::To32BitIndex(in_t).slice(offsets_32bit, extents_32bit); + } else { + out_t.device(place) = in_t.slice(offsets, extents); + } out->Resize(out_dims); } @@ -300,8 +313,6 @@ class SliceGradKernel : public framework::OpKernel { private: template void SliceCompute(const framework::ExecutionContext& context) const { - auto& place = - *context.template device_context().eigen_device(); auto axes = context.Attr>("axes"); auto starts_int = context.Attr>("starts"); @@ -435,13 +446,189 @@ class SliceGradKernel : public framework::OpKernel { paddings[i].first = offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; } + 
EigenPaddingCompute(context, d_input, in_dims, d_out, out_dims, paddings); + } + + template + void EigenPaddingCompute( + const framework::ExecutionContext& context, framework::Tensor* d_input, + const framework::DDim& in_dims, const framework::Tensor* d_out, + const framework::DDim& out_dims, + const Eigen::array, D>& paddings) const { + if (D <= 3) { + // if dimension less than 3, cannot reduce dimension + LaunchEigenPadding(context, d_input, in_dims, d_out, out_dims, paddings); + } else { // else we can reduce dimension + // count not-zero padding number, and record the dimension + int need_pad_num = 0, pad_dim = -1; + for (size_t i = 0; i < D; i++) { + if (paddings[i].first != 0 || paddings[i].second != 0) { + need_pad_num++; + pad_dim = i; + } + } + + if (need_pad_num == 0) { + // do not need padding, pass if data address same, else copy + if (d_input->mutable_data(context.GetPlace()) == d_out->data()) { + // inplace, do not any operator, pass + } else { + framework::TensorCopy( + *d_out, context.GetPlace(), + context.template device_context(), + d_input); + } + } else if (need_pad_num == 1) { + // only need padding one dimension, we can reduce dimension. + // only the padding dimension is available for us. 
+ // How to reduce dimension(5 to 3 for example): + // before(D=5): + // in_dims: [x1, x2, x3, x4, x5] + // padding.first: [0, 0, a, 0, 0] + // padding.second: [0, 0, b, 0, 0] + // | | + // V V + // after(D=3): + // reshaped_in_dims: [x1*x2, x3, x4*x5] + // reshaped_padding.first: [0, a, 0] + // reshaped_padding.second: [0, b, 0] + + if (pad_dim == D - 1) { + // only last dimension need padding, + // reshape the dimension of tensor in 2: [preceding, padding] + std::vector in_tore_shape(2, 1), out_tore_shape(2, 1); + Eigen::array, 2> reshaped_padding; + + // first dimension is the accumulate of preceding dimension + for (int i = 0; i < pad_dim; i++) { + in_tore_shape[0] *= in_dims[i]; + out_tore_shape[0] *= out_dims[i]; + } + // second dimension is the padding dimension + in_tore_shape[1] = in_dims[pad_dim]; + out_tore_shape[1] = out_dims[pad_dim]; + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: the first dimension do not need padding, + // set padding[0] zero + reshaped_padding[0].first = reshaped_padding[0].second = 0; + // the second dimension is the previous padding dimension + reshaped_padding[1].first = paddings[pad_dim].first; + reshaped_padding[1].second = paddings[pad_dim].second; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } else if (pad_dim == 0) { + // only first dimension need padding, + // reshape the dimension of tensor in 2: [padding, succeeding] + // similar to (D - 1) + std::vector in_tore_shape(2, 1), out_tore_shape(2, 1); + Eigen::array, 2> reshaped_padding; + + // first dimension is the padding dimension + in_tore_shape[0] = in_dims[pad_dim]; + out_tore_shape[0] = out_dims[pad_dim]; + // sencond dimension is the accumulate of succeeding dimension + for (size_t i = pad_dim + 1; i < D; i++) { + in_tore_shape[1] *= 
 in_dims[i]; + out_tore_shape[1] *= out_dims[i]; + } + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: + // the first dimension is the previous padding dimension + reshaped_padding[0].first = paddings[pad_dim].first; + reshaped_padding[0].second = paddings[pad_dim].second; + // the second dimension do not need padding, set padding[1] zero + reshaped_padding[1].first = reshaped_padding[1].second = 0; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } else { + // other dimension need padding + // reshape the dimension of tensor in 3: + // [preceding, padding, succeeding] + std::vector in_tore_shape(3, 1), out_tore_shape(3, 1); + Eigen::array, 3> reshaped_padding; + + // first dimension is the accumulate of preceding dimension + for (int i = 0; i < pad_dim; i++) { + in_tore_shape[0] *= in_dims[i]; + out_tore_shape[0] *= out_dims[i]; + } + // second dimension is the padding dimension + in_tore_shape[1] = in_dims[pad_dim]; + out_tore_shape[1] = out_dims[pad_dim]; + // third dimension is the accumulate of succeeding dimension + for (size_t i = pad_dim + 1; i < D; i++) { + in_tore_shape[2] *= in_dims[i]; + out_tore_shape[2] *= out_dims[i]; + } + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: + // the first dimension do not need padding, set padding[0] zero + reshaped_padding[0].first = reshaped_padding[0].second = 0; + // the second dimension is the previous padding dimension + reshaped_padding[1].first = paddings[pad_dim].first; + reshaped_padding[1].second = paddings[pad_dim].second; + // the third dimension do not need padding, set padding[2] zero + 
reshaped_padding[2].first = reshaped_padding[2].second = 0; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } + } else { + // need padding at many dimension, cannot reduce dimension + LaunchEigenPadding(context, d_input, in_dims, d_out, out_dims, + paddings); + } + } + } + + template + void LaunchEigenPadding( + const framework::ExecutionContext& context, framework::Tensor* d_input, + const framework::DDim& in_dims, const framework::Tensor* d_out, + const framework::DDim& out_dims, + const Eigen::array, D>& paddings) const { + auto& place = + *context.template device_context().eigen_device(); auto d_in_t = framework::EigenTensor::From( - *d_input); + *d_input, in_dims); auto d_out_t = framework::EigenTensor::From( *d_out, out_dims); - d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + + if (d_input->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.pad: + // if element number less than INT_MAX, change the type of index to int + Eigen::array, D> paddings_32bit; + for (size_t i = 0; i < D; i++) { + paddings_32bit[i] = + std::make_pair(paddings[i].first, paddings[i].second); + } + framework::To32BitIndex(d_in_t).device(place) = + framework::To32BitIndex(d_out_t).pad(paddings_32bit, T(0)); + } else { + d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index e5e0dafdae0b15..9974536da9acb4 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -124,11 +124,13 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, ops::SliceNPUKernel); REGISTER_OP_NPU_KERNEL( slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 
f06f59f3b4e005..d20b3ac04bf95c 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -159,12 +159,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(softmax, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(softmax_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index e58b39252ce5f4..fbaf76d4e7cd89 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -55,7 +55,7 @@ class SoftmaxWithCrossEntropyOpMaker "the given labels as soft labels.") .SetDefault(false); AddAttr( - "softmax_switch", + "use_softmax", "(bool, default: true), A flag to indicate whether to do softmax ") .SetDefault(true); AddAttr( @@ -320,7 +320,6 @@ REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, REGISTER_OP_VERSION(softmax_with_cross_entropy) .AddCheckpoint( R"ROC( - Add a new attribute [softmax_switch] )ROC", + Add a new attribute [use_softmax] )ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( - "softmax_switch", "A flag to indicate whether to do softmax", - true)); + "use_softmax", "A flag to indicate whether to do softmax", true)); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 140059256c3cc9..4aec4c17422792 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -772,10 +772,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public 
framework::OpKernel { platform::is_gpu_place(context.GetPlace()), true, platform::errors::Unavailable("softmax_with_cross_entropy operator's " "CUDA kernel only runs on GPU device.")); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { const Tensor* softmax = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax_out = context.Output("Softmax"); @@ -925,10 +925,10 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int block = 512; auto stream = context.cuda_device_context().stream(); auto ignore_index = context.Attr("ignore_index"); - auto softmax_switch = context.Attr("softmax_switch"); + auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { if (context.Attr("soft_label")) { int grid = (n * d + block - 1) / block; const T* label_data = labels->data(); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 55b811cbe31e40..74316841a13b17 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -31,10 +31,10 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_cpu_place(context.GetPlace()), true, platform::errors::Unimplemented("This kernel only runs on CPU.")); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { const Tensor* softmax = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax_out = context.Output("Softmax"); @@ -113,9 +113,9 @@ class 
SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); - if (logit_grad != softmax || !softmax_switch) { + if (logit_grad != softmax || !use_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); } @@ -138,8 +138,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() .eigen_device(); - if (!softmax_switch) { - // softmax_switch step1 + if (!use_softmax) { + // use_softmax step1 if (soft_label) { auto lbl_mat = framework::EigenMatrix::From(labels_2d); logit_grad_mat.device(place) = @@ -148,7 +148,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * logit_grad_mat; } - // softmax_switch step2 + // use_softmax step2 else { const int64_t* label_data = labels->data(); T* logit_grad_data = logit_grad->data(); @@ -181,7 +181,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { return; } - // for softmax_switch=False, continue + // for use_softmax=False, continue if (soft_label) { // when soft_label = True, ignore_index is not supported diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index c777a02f96bd9a..a34946315f5a81 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -67,12 +67,10 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { // on and off Tensor on_tensor(framework::proto::VarType::INT32); on_tensor.mutable_data({1}, ctx.GetPlace()); - 
TensorFromVector(std::vector{static_cast(1)}, - ctx.device_context(), &on_tensor); + FillNpuTensorWithConstant(&on_tensor, static_cast(1)); Tensor off_tensor(framework::proto::VarType::INT32); off_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(0)}, - ctx.device_context(), &off_tensor); + FillNpuTensorWithConstant(&off_tensor, static_cast(0)); // one_hot Tensor tmp_onehot(on_tensor.type()); @@ -142,12 +140,10 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { // on and off Tensor on_tensor(framework::proto::VarType::INT32); on_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(1)}, - ctx.device_context(), &on_tensor); + FillNpuTensorWithConstant(&on_tensor, static_cast(1)); Tensor off_tensor(framework::proto::VarType::INT32); off_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(0)}, - ctx.device_context(), &off_tensor); + FillNpuTensorWithConstant(&off_tensor, static_cast(0)); // one_hot Tensor tmp_onehot(on_tensor.type()); diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 22dc81cbd79e0e..1de7ca8c7bdbf4 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(squeeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 741f86f35848b2..0f520adba57a20 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, ops::SumKernel, ops::SumKernel, + ops::SumKernel, ops::SumKernel); diff --git 
a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 620231eb2e2984..eb20e1c2cd2748 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -250,8 +250,12 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase { auto dout_name = Input(framework::GradVarName("Out")); std::vector grad_names; + // NOTE(Aurelius84): Generating grad base name by Input("X") instead of + // fixed string to avoid incorrectly sharing same var's allocation in + // multi-thread that will cause wrong calculation result. + std::string grad_base_name = base_name + "_temp_grad_"; - LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"), + LodTensorVectorResizeFromLodTensorArray(scope, grad_base_name, Input("X"), &grad_names); auto use_stack = Attr("use_stack"); diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index 046ae90ec7c6e0..f1b64f042c3c09 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, framework::LoDTensor cpu_tensor; platform::CPUPlace cpu_place; TensorCopy(print_tensor, cpu_place, &cpu_tensor); +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(print_tensor.place())) { + platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); + } +#endif data = cpu_tensor.data(); } diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 6527362bb96907..b98e620cc2d342 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -286,3 +286,20 @@ REGISTER_OP_CPU_KERNEL( ops::TileGradKernel, ops::TileGradKernel, ops::TileGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + tile, ops::TileKernel, + ops::TileKernel, 
+ ops::TileKernel, + ops::TileKernel, + ops::TileKernel, + ops::TileKernel); +REGISTER_OP_CUDA_KERNEL( + tile_grad, ops::TileGradKernel, + ops::TileGradKernel, + ops::TileGradKernel, + ops::TileGradKernel, + ops::TileGradKernel); +#endif diff --git a/paddle/fluid/operators/tile_op.cu b/paddle/fluid/operators/tile_op.cu deleted file mode 100644 index 5ca82cd6a1f435..00000000000000 --- a/paddle/fluid/operators/tile_op.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index ea328361ded75a..2c2745018be402 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" #include "paddle/fluid/operators/trace_op.h" diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 36f7a695358511..f6712814e1e3b8 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -126,12 +126,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(transpose2, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(transpose2_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index 798709b1088d3f..1f25a880758923 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -12,25 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include #include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { namespace operators { template -struct TruncatedNormal { +struct GPUTruncatedNormal { T mean, std; T a_normal_cdf; T b_normal_cdf; unsigned int seed; T numeric_min; - __host__ __device__ TruncatedNormal(T mean, T std, T numeric_min, int seed) + __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; @@ -110,10 +113,10 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { TruncatedNormalOffset(mean, std, std::numeric_limits::min(), seed_offset.first, gen_offset)); } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormal(mean, std, std::numeric_limits::min(), seed)); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GPUTruncatedNormal( + mean, std, std::numeric_limits::min(), seed)); } } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 4253187fdde74d..7f3190d9112c66 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -35,28 +35,24 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { float mean = ctx.Attr("mean"); Tensor mean_tensor(framework::proto::VarType::FP32); mean_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{mean}, ctx.device_context(), - &mean_tensor); + FillNpuTensorWithConstant(&mean_tensor, mean); float std = ctx.Attr("std"); Tensor 
std_tensor(framework::proto::VarType::FP32); std_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{std}, ctx.device_context(), - &std_tensor); + FillNpuTensorWithConstant(&std_tensor, std); int32_t seed_var = ctx.Attr("seed"); Tensor min_tensor(framework::proto::VarType::FP32); min_tensor.mutable_data({1}, ctx.GetPlace()); float min_value = mean - std * 2.0; - TensorFromVector(std::vector{min_value}, ctx.device_context(), - &min_tensor); + FillNpuTensorWithConstant(&min_tensor, min_value); Tensor max_tensor(framework::proto::VarType::FP32); max_tensor.mutable_data({1}, ctx.GetPlace()); float max_value = mean + std * 2.0; - TensorFromVector(std::vector{max_value}, ctx.device_context(), - &max_tensor); + FillNpuTensorWithConstant(&max_tensor, max_value); auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 6efada4343ca54..007276b16d7f2e 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -18,10 +18,41 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/bfloat16.h" namespace paddle { namespace operators { +namespace { +template +inline void UniformRealDistribution(T *data, const int64_t &size, + const float &min, const float &max, + const unsigned int &seed) { + VLOG(4) << "[CPU] UniformRandomKernel"; + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(paddle::platform::bfloat16 *data, + const int64_t &size, const float &min, + const float &max, + const unsigned int &seed) { + VLOG(4) << "[CPU] UniformRandomKernel"; + std::uniform_real_distribution dist(min, max); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} +} // namespace + // It seems that Eigen::Tensor::random in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. 
@@ -61,17 +92,11 @@ class CPUUniformRandomKernel : public framework::OpKernel { framework::ToTypeName(out_var->Type()))); } T *data = tensor->mutable_data(ctx.GetPlace()); - int64_t size = tensor->numel(); - std::uniform_real_distribution dist( - static_cast(ctx.Attr("min")), - static_cast(ctx.Attr("max"))); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } + UniformRealDistribution( + data, size, ctx.Attr("min"), ctx.Attr("max"), + static_cast(ctx.Attr("seed"))); unsigned int diag_num = static_cast(ctx.Attr("diag_num")); @@ -257,9 +282,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(uniform_random, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); -REGISTER_OP_CPU_KERNEL(uniform_random_batch_size_like, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); +REGISTER_OP_CPU_KERNEL( + uniform_random, paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); +REGISTER_OP_CPU_KERNEL( + uniform_random_batch_size_like, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 563a6c165b7485..ceb13a3dda41df 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include - #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 6052e533643f3c..18a4154be30ac7 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -24,9 +24,9 @@ namespace operators { using Tensor = framework::Tensor; inline std::vector GetNewDataFromShapeTensor( - const Tensor *new_data_tensor) { + const Tensor* new_data_tensor) { if (new_data_tensor->type() == framework::proto::VarType::INT64) { - auto *new_data = new_data_tensor->data(); + auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { TensorCopySync(*new_data_tensor, platform::CPUPlace(), @@ -37,7 +37,7 @@ inline std::vector GetNewDataFromShapeTensor( new_data + new_data_tensor->numel()); return vec_new_data; } else if (new_data_tensor->type() == framework::proto::VarType::INT32) { - auto *new_data = new_data_tensor->data(); + auto* new_data = new_data_tensor->data(); std::vector vec_new_data; framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { @@ -58,7 +58,7 @@ inline std::vector GetNewDataFromShapeTensor( } inline std::vector GetNewDataFromShapeTensorList( - const std::vector &list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { std::vector vec_new_shape; vec_new_shape.reserve(list_new_shape_tensor.size()); for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -97,6 +97,5 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index 9b4485047f05c1..a145c914a8621b 100644 --- 
a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(unsqueeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index bb968743585f7d..b1cd172923ee6d 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -12,7 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/where_index_op.h" @@ -25,52 +33,124 @@ namespace operators { using CUDADeviceContext = paddle::platform::CUDADeviceContext; template -class CUDAWhereIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); - - // TODO(zhoukunsheng): Should optimize to ensure GPU is faster than CPU. 
- framework::Tensor cond_cpu; - framework::TensorCopy(*condition, platform::CPUPlace(), &cond_cpu); - - const T* cond_data = cond_cpu.data(); - int64_t numel = cond_cpu.numel(); - auto dims = cond_cpu.dims(); - int rank = dims.size(); - - thrust::host_vector h_true_index; - for (int64_t i = 0; i < numel; i++) { - if (static_cast(cond_data[i])) { - h_true_index.push_back(i); +__global__ void GetTrueNum(const T *cond_data, const int64_t numel, + int64_t *true_num_array) { + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { + true_num_array[idx] = + static_cast(static_cast(cond_data[idx])); + } +} + +template +__global__ void SetTrueIndex(int64_t *out_ptr, const T *cond_data, + const int64_t numel, const int64_t *stride_array, + const int64_t rank, + const int64_t *true_num_array) { + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { + // true_num_array is calculated by cub::InclusiveSum, + // because the first element of true_num_array is 1, + // so we need to subtract 1 to get the true index.
+ const int64_t true_index = true_num_array[idx] - 1; + if (static_cast(cond_data[idx])) { + int64_t rank_index = idx; + for (int j = 0; j < rank; j++) { + const int64_t out_index = rank_index / stride_array[j]; + out_ptr[true_index * rank + j] = out_index; + rank_index -= out_index * stride_array[j]; } } - thrust::device_vector d_true_index = h_true_index; - int64_t* ptr_true_index = thrust::raw_pointer_cast(d_true_index.data()); - - size_t true_num = h_true_index.size(); + } +} +template +class CUDAWhereIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *condition = context.Input("Condition"); + auto *out = context.Output("Out"); + auto &dev_ctx = context.template device_context(); + + const T *cond_data = condition->data(); + const int64_t numel = condition->numel(); + auto dims = condition->dims(); + const int rank = dims.size(); + + auto d_array_mem = memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); + auto h_array_mem = + memory::Alloc(platform::CPUPlace(), (rank + 1) * sizeof(int64_t)); + + // "stride_array" is an array and len(stride_array)==rank, + // each element is the stride of each dimension -- the length from i to i+1. 
+ int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); + int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); + + // "true_num_array" is an array and len(true_num_array)==numel, + // at the beginning, + // "true_num_array" will set 1 if condition[i] == true else 0, + // then it will be calculated by cub::InclusiveSum, + // so that we can get the true number before i as the out index + int64_t *d_true_num_array = d_stride_array + rank; + + // the total_true_num is the total number of condition[i] == true + int64_t *h_total_true_num = h_stride_array + rank; + + // allocate cub memory + size_t cub_size = 0; + cub::DeviceScan::InclusiveSum(nullptr, cub_size, d_true_num_array, + d_true_num_array, numel, dev_ctx.stream()); + auto cub_mem = memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); + void *cub_data = cub_mem->ptr(); + + // set d_true_num_array[i]=1 if cond_data[i]==true else 0 + const int threads = std::min(numel, static_cast(128)); + const int64_t need_grids = (numel + threads - 1) / threads; + const int grids = std::min(need_grids, static_cast(256)); + GetTrueNum<<>>(cond_data, numel, + d_true_num_array); + + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true.
+ // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + cub::DeviceScan::InclusiveSum(cub_data, cub_size, d_true_num_array, + d_true_num_array, numel, dev_ctx.stream()); + + // calculate each dimension's stride + h_stride_array[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_stride_array, platform::CPUPlace(), h_stride_array, + rank * sizeof(int64_t), dev_ctx.stream()); + + // get total true number and set output size + // the last element of cub::InclusiveSum is the total number + memory::Copy(platform::CPUPlace(), h_total_true_num, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_true_num_array + numel - 1, sizeof(int64_t), + dev_ctx.stream()); + dev_ctx.Wait(); + + int64_t true_num = *h_total_true_num; out->Resize(framework::make_ddim({static_cast(true_num), rank})); - auto out_ptr = out->mutable_data(context.GetPlace()); + auto out_data = out->mutable_data(context.GetPlace()); if (true_num == 0) { return; } - thrust::host_vector h_stride(rank, 0); - h_stride[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride[i] = h_stride[i + 1] * dims[i + 1]; - } - thrust::device_vector d_stride = h_stride; - int64_t* ptr_stride = thrust::raw_pointer_cast(d_stride.data()); - - auto& dev_ctx = context.template device_context(); - WhereIndexFunctor functor(ptr_true_index, true_num, ptr_stride, - rank, out_ptr); - platform::ForRange for_range(dev_ctx, true_num); - for_range(functor); + // using true_num_array and stride_array to calculate the output index + SetTrueIndex<<>>( + out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 584dbd4756aa09..0827d6a5ae7644 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ 
b/paddle/fluid/platform/CMakeLists.txt @@ -106,11 +106,11 @@ ELSE() ENDIF() IF(WITH_ASCEND_CL) -cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() IF(WITH_GPU) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) + nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() IF(WITH_ROCM) hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) @@ -136,13 +136,18 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) -cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) +cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_GPU OR WITH_ROCM) cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) target_link_libraries(device_context cuda_resource_pool) endif() +if(WITH_ASCEND_CL) + cc_library(npu_resource_pool SRCS npu_resource_pool.cc DEPS npu_info) + target_link_libraries(device_context npu_resource_pool) +endif() + cc_test(init_test SRCS init_test.cc DEPS device_context) if(WITH_GPU) @@ -185,6 +190,7 @@ cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) IF(WITH_GPU) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) + nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) ENDIF() diff --git 
a/paddle/fluid/platform/ascend_npu_info.h b/paddle/fluid/platform/ascend_npu_info.h index 7afed121a5acb6..213013f5b12777 100644 --- a/paddle/fluid/platform/ascend_npu_info.h +++ b/paddle/fluid/platform/ascend_npu_info.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index 6cb4901f1dde32..a362e2903f2456 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -21,6 +21,15 @@ #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +#define PADDLE_CUDA_BF16 +#include +#endif + #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -44,6 +53,7 @@ struct PADDLE_ALIGN(2) bfloat16 { public: uint16_t x; + // Constructors bfloat16() = default; bfloat16(const bfloat16& o) = default; bfloat16& operator=(const bfloat16& o) = default; @@ -60,15 +70,34 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#else +#if defined(PADDLE_CUDA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); #endif +#endif + } + +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); } +#endif template HOSTDEVICE inline explicit bfloat16(const T& val) : x(bfloat16(static_cast(val)).x) {} +// Assignment operators +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + return *this; + } +#endif + HOSTDEVICE inline bfloat16& operator=(bool b) { x = b ? 
0x3f80 : 0; return *this; @@ -124,13 +153,24 @@ struct PADDLE_ALIGN(2) bfloat16 { return *this; } + // Conversion opertors HOSTDEVICE inline explicit operator float() const { +#ifdef PADDLE_CUDA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else float val = 0.f; uint16_t temp = x; memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); return val; +#endif + } + +#ifdef PADDLE_CUDA_BF16 + HOSTDEVICE inline explicit operator __nv_bfloat16() const { + return *reinterpret_cast(&x); } +#endif HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } @@ -223,6 +263,7 @@ HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { return res; } +// Comparison operators HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { return static_cast(a) == static_cast(b); } diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu new file mode 100644 index 00000000000000..dbbb72920a53b0 --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/bfloat16.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#if defined(PADDLE_CUDA_BF16) +namespace paddle { +namespace platform { + +TEST(bfloat16, convert_float32_to_bfloat16_on_gpu) { + // Convert float32 to bfloat16 + EXPECT_EQ((bfloat16(1.0f)).x, 0x3f80); + EXPECT_EQ((bfloat16(0.5f)).x, 0x3f00); + EXPECT_EQ((bfloat16(0.33333f)).x, 0x3eab); + EXPECT_EQ((bfloat16(0.0f)).x, 0x0000); + EXPECT_EQ((bfloat16(-0.0f)).x, 0x8000); + EXPECT_EQ((bfloat16(65536.0f)).x, 0x4780); +} + +TEST(bfloat16, assignment_operator_on_gpu) { + // Assignment operator + bfloat16 v_assign; + v_assign = nv_bfloat16(bfloat16(1.0f)); + EXPECT_EQ(v_assign.x, 0x3f80); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eab); +} + +TEST(bfloat16, convert_bfloat16_to_float32_on_gpu) { + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, lod_tensor_on_gpu) { + framework::LoDTensor src_tensor; + framework::LoDTensor gpu_tensor; + framework::LoDTensor dst_tensor; + + bfloat16 *src_ptr = src_tensor.mutable_data( + framework::make_ddim({2, 2}), CPUPlace()); + + bfloat16 arr[4] = {bfloat16(1.0f), bfloat16(0.5f), bfloat16(0.33333f), + bfloat16(0.0f)}; + memcpy(src_ptr, arr, 4 * sizeof(bfloat16)); + + // CPU LoDTensor to GPU LoDTensor + CUDAPlace gpu_place(0); + CUDADeviceContext gpu_ctx(gpu_place); + framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); + + // GPU LoDTensor to CPU LoDTensor + framework::TensorCopy(gpu_tensor, CPUPlace(), gpu_ctx, &dst_tensor); + + // Sync before comparing LoDTensors + gpu_ctx.Wait(); + const bfloat16 *dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 4; ++i) { + 
EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x); + } +} + +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +TEST(bfloat16, cast) { + bfloat16 a; + a.x = 0x0070; + auto b = a; + { + // change semantic, keep the same value + bfloat16 c = reinterpret_cast(reinterpret_cast(b)); + EXPECT_EQ(b, c); + } + + { + // use uint32 low 16 bit store float16 + uint32_t c = reinterpret_cast(b); + bfloat16 d; + d.x = c; + EXPECT_EQ(b, d); + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 197f905ba68a29..b0b857f7ee3f2a 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -22,6 +22,7 @@ #include "boost/variant.hpp" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/hccl.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -126,6 +127,113 @@ class NCCLCommContext { }; #endif +#if defined(PADDLE_WITH_ASCEND_CL) +// In order to apply hierarchical communication with HCCL, we need +// a communication ring contains HCCL communicators associated to a global +// HCCLUniqueId. E.g. for a hierarchical case, +// +// 11 - 12 21 - 22 +// | | | | +// 13 - 14 - 23 - 24 +// | | +// 31 - 32 - 41 - 42 +// | | | | +// 33 - 34 43 - 44 +// +// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24), +// (31,32,33,34), (41,42,43,44) as bottoms respectively. 
+// +// We could also use a single communication ring for the flatten case +// +// The HCCLComm instance is created and reversed in the HCCLCommContext +// singleton with a global user specified group id. +class NPUDeviceContext; + +#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE" +#define ENV_RANK_ID "PADDLE_TRAINER_ID" + +class HCCLComm { + public: + virtual int ring_id() const = 0; + virtual int nranks() const = 0; + virtual int rank() const = 0; + virtual int device_id() const = 0; + virtual HcclComm comm() const = 0; + virtual aclrtStream stream() const = 0; + virtual NPUDeviceContext* dev_context() const = 0; + virtual ~HCCLComm() = default; +}; + +// A singleton HCCL communicator context reserves communication ring ids +class HCCLCommContext { + public: + static HCCLCommContext& Instance() { + static HCCLCommContext comm_ctx; + return comm_ctx; + } + + HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, int rank, + int dev_id, int ring_id); + // a latter comm with the same dev_id and the same ring_id + // will override the former + HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, int dev_id, + int ring_id); + + // retrieve a communicator by the ring id in multiprocessing mode + HCCLComm* Get(int ring_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator in ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1, + platform::errors::InvalidArgument( + "One device id should be specified to retrieve from " + "multiple communicators.")); + return comm_map_.at(ring_id).begin()->second.get(); + } + + // retrieve a communicator by the ring id and the device id + HCCLComm* Get(int ring_id, int dev_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator of ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_GT( + comm_map_.at(ring_id).count(dev_id), 0, + 
platform::errors::InvalidArgument( + "Communicator at device id %d has not been initialized in ring %d.", + dev_id, ring_id)); + return comm_map_.at(ring_id).at(dev_id).get(); + } + + // retrieve a communicator by the ring id and place + HCCLComm* Get(int ring_id, Place place) const { + return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device); + } + + private: + // Init global hcom + HCCLCommContext() {} + // we may use group feature in the feature + // HCCLCommContext() { InitHcomWorldGroup(); } + + HcclComm comm_; + + public: + ~HCCLCommContext() {} + + std::once_flag once_flag_; + std::mutex comm_map_mutex_; + // ring id to dev-HCCLComm + std::map>> comm_map_; + + // void InitHcomWorldGroup(); + void ReleaseHCCLComms(); + + DISABLE_COPY_AND_ASSIGN(HCCLCommContext); +}; +#endif + #if defined(PADDLE_WITH_XPU_BKCL) // In order to apply hierarchical communication with BKCL, we need // a communication ring contains BKCL communicators associated to a global diff --git a/paddle/fluid/platform/collective_helper_npu.cc b/paddle/fluid/platform/collective_helper_npu.cc new file mode 100644 index 00000000000000..f30e5fa833d44d --- /dev/null +++ b/paddle/fluid/platform/collective_helper_npu.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include + +namespace paddle { +namespace platform { + +class HCCLCommImpl : public HCCLComm { + public: + void set_ring_id(int ring_id) { ring_id_ = ring_id; } + int ring_id() const override { return ring_id_; } + + void set_nranks(int nranks) { nranks_ = nranks; } + int nranks() const override { return nranks_; } + + void set_rank(int rank) { rank_ = rank; } + int rank() const override { return rank_; } + + int device_id() const override { + return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; + } + + ~HCCLCommImpl() { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); + } + + void set_comm(HcclComm comm) { comm_ = comm; } + HcclComm comm() const override { return comm_; } + + aclrtStream stream() const override { return dev_ctx_->stream(); } + + void set_dev_ctx(std::unique_ptr&& dev_ctx) { + dev_ctx_ = std::move(dev_ctx); + } + NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); } + + private: + int ring_id_; + int nranks_; + int rank_; + HcclComm comm_; + std::unique_ptr dev_ctx_; +}; + +HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id) { + PADDLE_ENFORCE_NOT_NULL(hccl_id, + platform::errors::InvalidArgument( + "The hccl unique id should not be null.")); + PADDLE_ENFORCE_GT( + nranks, 1, + platform::errors::InvalidArgument( + "Expected nranks > 1. But received nranks is %d.", nranks)); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::InvalidArgument( + "Expected rank >= 0. But received rank is %d.", rank)); + PADDLE_ENFORCE_LT( + rank, nranks, + platform::errors::InvalidArgument( + "Expected rank < nranks. But received rank is %d, nranks is %d.", + rank, nranks)); + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::InvalidArgument( + "Expected dev_id >= 0. 
But received dev_id is %d.", dev_id)); + + HcclComm comm; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id)); + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks + << ", hccl_id: " << hccl_id << ", rank: " << rank; + PADDLE_ENFORCE_NPU_SUCCESS( + platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm)); + + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks + << ", hccl_id: " << hccl_id << ", rank: " << rank; + + auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id); + + VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id + << " has been created on device " << dev_id + << ", with comm: " << comm_wrapper->comm(); + + std::call_once(once_flag_, []() { + std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); }); + }); + + return comm_wrapper; +} + +HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id) { + std::unique_ptr dev_ctx( + new NPUDeviceContext(NPUPlace(dev_id))); + + HCCLCommImpl* c = new HCCLCommImpl; + c->set_ring_id(ring_id); + c->set_nranks(nranks); + c->set_rank(rank); + c->set_comm(comm); + c->set_dev_ctx(std::move(dev_ctx)); + + comm_map_mutex_.lock(); + if (comm_map_.count(ring_id) == 0) { + comm_map_.emplace(ring_id, std::map>()); + } + auto& dev2comm = comm_map_[ring_id]; + + dev2comm.emplace(dev_id, std::unique_ptr(c)); + comm_map_mutex_.unlock(); + + if (ring_id == 0) { + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::NPUPlace(dev_id))); + dev_ctx->set_hccl_comm(comm); + } + + return comm_map_[ring_id][dev_id].get(); +} + +void HCCLCommContext::ReleaseHCCLComms() { + for (auto& p : comm_map_) { + for (auto& q : p.second) { + q.second.reset(); + } + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc index 35e9804e2a3081..4af156d1577dd9 100644 --- 
a/paddle/fluid/platform/denormal.cc +++ b/paddle/fluid/platform/denormal.cc @@ -28,7 +28,7 @@ #endif #if !defined(GCC_WITHOUT_INTRINSICS) && !defined(PADDLE_WITH_ARM) && \ - !defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS) + !defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS) && !defined(_WIN32) #define DENORM_USE_INTRINSICS #endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0ade3898c336b..9a47ac45462ed7 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif - #include "glog/logging.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace memory { @@ -254,8 +254,9 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - NPUDeviceGuard guard(place_.device); - PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); + platform::RecordEvent record_event("NPUDeviceContext/wait"); + VLOG(4) << "NPU context(" << this << ") Wait"; + stream_->Wait(); } aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } @@ -536,6 +537,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); + p_exec_items_.reset(new ExecMap()); p_mutex_.reset(new std::mutex()); } @@ -559,7 +561,7 @@ MKLDNNDeviceContextThreadLocals::Body::~Body() { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap(exec_ptr_); } void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( @@ -606,17 +608,34 @@ mkldnn::stream& 
MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { return cur_stream; } -void MKLDNNDeviceContext::ResetBlobMap() { +void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { std::lock_guard lock(*p_mutex_); if (!block_next_cache_clearing_) { VLOG(3) << "Clearing DNNL cache."; - p_blobmap_->clear(); + // If no specific executor pointer then clear + // everything. For executor pointer then clear only + // objects allocated when using given executor + if (ptr == nullptr) { + p_blobmap_->clear(); + } else { + for (auto& v : (*p_exec_items_)[ptr]) { + (v.first)->erase(v.second); + } + p_exec_items_->erase(ptr); + } } else { VLOG(3) << "Prevented Clearing DNNL cache."; block_next_cache_clearing_ = false; } } +void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, + KeyBlob::iterator it) const { + // Take current executor addess from TLS + // and for this executor's items add the one defined with arguments + (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); +} + void MKLDNNDeviceContext::BlockNextCacheClearing() { std::lock_guard lock(*p_mutex_); VLOG(3) << "Next DNNL cache clearing has been blocked."; @@ -681,7 +700,11 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, // Find Blob via name auto blob_it = pBlob->find(name); if (blob_it == pBlob->end()) { - (*pBlob)[name] = data; + auto el = + pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; + // Register new element in per executor map + // to have easily erased when executor terminated + LinkEntryWithExecutor(pBlob, el.first); } else { blob_it->second = data; // set data to existing blob } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2578c9b6cdea5a..d91e14ec3aa923 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -189,19 +189,35 @@ class NPUDeviceContext : public DeviceContext { /*! \brief Return npu stream in the device context. 
*/ aclrtStream stream() const; -#ifdef PADDLE_WITH_ASCEND_HCCL - /*! \brief Return bkcl context. */ - HCCLContext_t hccl_context() const { return hccl_context_; } + template + void AddStreamCallback(Callback&& callback) const { + return stream_->AddCallback(callback); + } - /*! \brief Set bkcl context. */ - void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; } + void WaitStreamCallback() const { return stream_->WaitCallback(); } + +#if defined(PADDLE_WITH_ASCEND_CL) + /*! \brief Return hccl communicators. */ + HcclComm hccl_comm() const { return hccl_comm_; } + + /*! \brief Set hccl communicators. */ + void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; } #endif + // template + // void AddStreamCallback(Callback&& callback) const { + // return stream_->AddCallback(callback); + // } + + // void WaitStreamCallback() const { return stream_->WaitCallback(); } + private: NPUPlace place_; aclrtContext context_; -#ifdef PADDLE_WITH_ASCEND_HCCL - HCCLContext_t hccl_context_; + +#ifdef PADDLE_WITH_ASCEND_CL + // HCCLContext_t hccl_context_; + HcclComm hccl_comm_{nullptr}; #endif // Need to be the same with other DeviceContext, @@ -657,6 +673,7 @@ class MKLDNNDeviceContextThreadLocals { mkldnn::stream cur_stream; std::string key_suffix; // Key identifying current Executor bool key_attach_thread_id = true; + void* exec_ptr_ = nullptr; Body(); ~Body(); @@ -673,6 +690,8 @@ class MKLDNNDeviceContextThreadLocals { const std::string& get_key_suffix(void) const { return key_suffix; } void disable_tid_in_key(void) { key_attach_thread_id = false; } bool is_tid_used_in_key(void) const { return key_attach_thread_id; } + void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } + void* get_curr_exec(void) const { return exec_ptr_; } }; MKLDNNDeviceContextThreadLocals() = default; MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = @@ -708,13 +727,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext { using ShapeBlob = 
umap_key_string_t; using BlobMap = umap_value_smart_t; + using ExecMap = std::unordered_map< + void*, std::vector, KeyBlob::iterator>>>; + explicit MKLDNNDeviceContext(CPUPlace place); /* \brief Get the active engine */ const mkldnn::engine& GetEngine() const { return tls().get_engine(); } + // Register object to currently used executor's map + void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; + // Remove all entries from the blob map - void ResetBlobMap(); + void ResetBlobMap(void* ptr); // Prevent next ResetBlobMap() void BlockNextCacheClearing(); @@ -737,6 +762,9 @@ class MKLDNNDeviceContext : public CPUDeviceContext { private: std::shared_ptr p_blobmap_; + // Map key is pointer of executor and value is a data(iterator in map) needed + // to erase + std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; }; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 717b5ce83c6c98..724a9b8483cdee 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -587,6 +587,8 @@ class DeviceTracerImpl : public DeviceTracer { BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId()); } else if (platform::is_cuda_pinned_place(r.place)) { event->set_place(proto::MemEvent::CUDAPinnedPlace); + } else if (platform::is_npu_place(r.place)) { + event->set_place(proto::MemEvent::NPUPlace); } else { PADDLE_THROW(platform::errors::Unimplemented( "The current place is not supported.")); diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index e65a38cd323aaf..8bff2ead0a2a3e 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc 
cusolver.cc nvtx.cc nvjpeg.cc) if (WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) @@ -9,7 +9,7 @@ endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. if (NOT APPLE AND NOT WIN32) - list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) + list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) if (WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() @@ -32,6 +32,8 @@ endif(CUPTI_FOUND) if(WITH_ROCM) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +elseif (WITH_ASCEND_CL) + cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc) else() nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 956acfe2771c50..be9cda4a2e9b6c 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -36,6 +36,13 @@ DEFINE_string(nccl_dir, "", "For instance, /usr/local/cuda/lib64. If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(hccl_dir, "", + "Specify path for loading hccl library, such as libhccl.so. " + "For instance, " + "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. 
If " + "default, " + "dlopen will search hccl from LD_LIBRARY_PATH"); + DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); DEFINE_string( @@ -93,6 +100,9 @@ static constexpr char* win_cublas_lib = static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll"; @@ -100,6 +110,9 @@ static constexpr char* win_cusolver_lib = static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; @@ -323,6 +336,17 @@ void* GetCurandDsoHandle() { #endif } +void* GetNvjpegDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true, + {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); +#endif +} + void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); @@ -392,6 +416,24 @@ void* GetNCCLDsoHandle() { warning_msg); #endif } +void* GetHCCLDsoHandle() { + std::string warning_msg( + "You may need to install 'hccl2' from Huawei official website: " + "before install PaddlePaddle."); +#if defined(__APPLE__) || 
defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, + warning_msg); +#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); + +#elif defined(PADDLE_WITH_ASCEND_CL) + return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {}, + warning_msg); +#else + return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, + warning_msg); +#endif +} void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index c3f5953c785791..9ab6dca0126bcb 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -29,11 +29,13 @@ void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); void* GetCurandDsoHandle(); +void* GetNvjpegDsoHandle(); void* GetCusolverDsoHandle(); void* GetNVRTCDsoHandle(); void* GetCUDADsoHandle(); void* GetWarpCTCDsoHandle(); void* GetNCCLDsoHandle(); +void* GetHCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); diff --git a/paddle/fluid/platform/dynload/hccl.cc b/paddle/fluid/platform/dynload/hccl.cc new file mode 100644 index 00000000000000..5efac7691eb98b --- /dev/null +++ b/paddle/fluid/platform/dynload/hccl.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL + +#include "paddle/fluid/platform/dynload/hccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hccl_dso_flag; +void *hccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +#if HCCL_VERSION_CODE >= 2212 +HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +#endif + +#if HCCL_VERSION_CODE >= 2703 +HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/hccl.h b/paddle/fluid/platform/dynload/hccl.h new file mode 100644 index 00000000000000..a56180ce2d4ca5 --- /dev/null +++ b/paddle/fluid/platform/dynload/hccl.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL + +#include +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +#define HCOM_GROUP_PREFIX "HCOM_GROUP_" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag hccl_dso_flag; +extern void* hccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using HCCL_func = decltype(&::__name); \ + std::call_once(hccl_dso_flag, []() { \ + hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(HcclReduceScatter); \ + __macro(HcclCommDestroy); \ + __macro(HcclAllReduce); \ + __macro(HcclCommInitRootInfo); \ + __macro(HcclGetRootInfo); \ + __macro(HcclBroadcast); \ + __macro(HcclCommInitClusterInfo); \ + __macro(HcclAllGather); + +HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) + +#if HCCL_VERSION_CODE >= 2212 +#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast); +HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) +#endif + +#if HCCL_VERSION_CODE >= 2703 +#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(HCCLSend); \ + __macro(HCCLRecv); +HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 5ff4bff4bff652..77ff3f3ccbbb6e 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -110,6 +110,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenActivationBackward); \ __macro(miopenConvolutionBackwardWeights); \ __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionForwardBias); \ __macro(miopenConvolutionBackwardBias); \ __macro(miopenConvolutionForwardGetWorkSpaceSize); \ __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ diff --git a/paddle/fluid/platform/dynload/nvjpeg.cc b/paddle/fluid/platform/dynload/nvjpeg.cc new file mode 100644 index 00000000000000..eb0ad78b9b73cd --- /dev/null +++ 
b/paddle/fluid/platform/dynload/nvjpeg.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/nvjpeg.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nvjpeg_dso_flag; +void *nvjpeg_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h new file mode 100644 index 00000000000000..ae457b2958f5de --- /dev/null +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag nvjpeg_dso_flag; +extern void *nvjpeg_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + nvjpegStatus_t operator()(Args... args) { \ + using nvjpegFunc = decltype(&::__name); \ + std::call_once(nvjpeg_dso_flag, []() { \ + nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDecode); + +NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index e72fbd246cf05d..1d105a1fd86825 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -27,7 +27,8 @@ void* tensorrt_plugin_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name -TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP); TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP); void* GetDsoHandle(const std::string& dso_name) { diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index e9bea9af9ca6e0..bc29a0472041af 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -37,7 +37,7 @@ void* 
GetTensorRtPluginHandle(); extern std::once_flag tensorrt_plugin_dso_flag; extern void* tensorrt_plugin_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ struct DynLoad__##__name { \ template \ void* operator()(Args... args) { \ @@ -55,6 +55,23 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL(p_##__name, \ + platform::errors::Unavailable( \ + "Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + #define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ struct DynLoad__##__name { \ template \ @@ -76,20 +93,25 @@ extern void* tensorrt_plugin_dso_handle; #ifdef NV_TENSORRT_MAJOR #if (NV_TENSORRT_MAJOR >= 6) -#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ - __macro(createInferBuilder_INTERNAL); \ - __macro(createInferRuntime_INTERNAL); \ +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ + __macro(createInferRuntime_INTERNAL); \ __macro(getPluginRegistry); #else -#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ - __macro(createInferBuilder_INTERNAL); \ +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ __macro(createInferRuntime_INTERNAL); #endif +#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \ + __macro(getInferLibVersion); + #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ __macro(initLibNvInferPlugins); 
-TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) #endif // end of NV_TENSORRT_MAJOR diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f0809d34d493e9..d42733823e669b 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "acl/acl.h" +#include "hccl/hccl_types.h" #endif // PADDLE_WITH_ASCEND_CL #include @@ -990,6 +991,16 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); } \ } while (0) +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = cudaGetLastError(); \ + if (UNLIKELY(res != cudaSuccess)) { \ + auto msg = ::paddle::platform::build_nvidia_error_msg(res); \ + PADDLE_THROW(platform::errors::Fatal("CUDA error after kernel (%s): %s", \ + OP, msg)); \ + } \ + } while (0) + inline void retry_sleep(unsigned milliseconds) { #ifdef _WIN32 Sleep(milliseconds); @@ -1220,6 +1231,7 @@ struct NPUStatusType {}; } DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); +DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS); } // namespace details inline std::string build_npu_error_msg(aclError stat) { @@ -1228,6 +1240,12 @@ inline std::string build_npu_error_msg(aclError stat) { return sout.str(); } +inline std::string build_npu_error_msg(HcclResult stat) { + std::ostringstream sout; + sout << " HCCL error, the error code is : " << stat << ". 
"; + return sout.str(); +} + #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 83b9544d23267b..1d76c2ea584b7e 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -578,6 +578,19 @@ DEFINE_string(tracer_mkldnn_ops_on, "", DEFINE_string(tracer_mkldnn_ops_off, "", "List of OneDNN operation types to be turned off"); +/** + * Debug related FLAG + * Name: check_kernel_launch + * Since Version: 2.1.0 + * Value Range: bool, default=false + * Example: + * Note: Check kernel launch status after every kernel compute. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DEFINE_bool(check_kernel_launch, false, + "Check kernel launch status after every kernel compute"); +#endif + /** * CUDNN related FLAG * Name: conv2d_disable_cudnn diff --git a/paddle/fluid/platform/hccl_helper.h b/paddle/fluid/platform/hccl_helper.h new file mode 100644 index 00000000000000..692f8dbe0bf1ee --- /dev/null +++ b/paddle/fluid/platform/hccl_helper.h @@ -0,0 +1,355 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) + +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/dynload/hccl.h" +#endif + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +#define HCCL_ID_VARNAME "HCCLID" + +namespace paddle { +namespace platform { + +inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { + return HCCL_DATA_TYPE_FP32; + } else if (type == framework::proto::VarType::FP16) { + return HCCL_DATA_TYPE_FP16; + } else if (type == framework::proto::VarType::INT32) { + return HCCL_DATA_TYPE_INT32; + } else if (type == framework::proto::VarType::INT8) { + return HCCL_DATA_TYPE_INT8; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in hccl is not supported.")); + } +} + +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// HCCL actions when use it. 
+ +// class HCCLGroupGuard { +// public: +// static std::mutex &HCCLMutex() { +// static std::mutex mtx; +// return mtx; +// } + +// inline HCCLGroupGuard() { +// HCCLMutex().lock(); +// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); +// } + +// inline ~HCCLGroupGuard() PADDLE_MAY_THROW { +// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); +// HCCLMutex().unlock(); +// } +// }; + +struct HCCLContext { + std::unique_ptr ctx_; + HcclComm comm_; + + explicit HCCLContext(int dev_id) + : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {} + + aclrtStream stream() const { return ctx_->stream(); } + HcclComm comm() const { return comm_; } + + int device_id() const { + return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; + } +}; + +struct HCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + explicit HCCLContextMap(const std::vector &places, + HcclRootInfo *hccl_id = nullptr, + size_t num_trainers = 1, size_t trainer_id = 0) { + PADDLE_ENFORCE_EQ(!places.empty(), true, + platform::errors::InvalidArgument( + "The HCCL place should not be empty.")); + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, HCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + platform::errors::Unavailable("HCCL Context Map does not support " + "contain two or more same device.")); + + std::unique_ptr comms(new HcclComm[order_.size()]); + // if num_trainers == 1, should create a new nccl id for local comms. 
+ if (num_trainers == 1 && hccl_id == nullptr) { + // we do not know how to tackle this situation under hccl + // std::lock_guard guard(HCCLGroupGuard::HCCLMutex()); + // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll( + // comms.get(), static_cast(order_.size()), order_.data())); + } else { + PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument( + "The HCCL id should not be null.")); + { + int nranks = num_trainers * order_.size(); + // HCCLGroupGuard gurad; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks + << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; + aclrtSetDevice(gpu_id); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo( + nranks, hccl_id, rank, comms.get() + i)); + } + } + } + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } + + HCCLContextMap(const HCCLContextMap &other) = delete; + HCCLContextMap &operator=(const HCCLContextMap &other) = delete; + + NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + + NPUDeviceContext *DevCtx(platform::Place p) const { + return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); + } + + const HCCLContext &at(platform::Place p) const { + return this->at(BOOST_GET_CONST(NPUPlace, p).device); + } + + const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; + +inline std::string GetFlatHCCLVarName(size_t pos) { + if (pos == 0) { + return HCCL_ID_VARNAME; + } + return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast(pos)); +} + +inline std::string GetHierarchicalExterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} +inline 
std::string GetHierarchicalInterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} + +class HCCLCommunicator { + public: + HCCLCommunicator() {} + virtual ~HCCLCommunicator() PADDLE_MAY_THROW {} + + HCCLContextMap *DefaultFlatCtx() const { + if (flat_ctxs_.size() == 0) { + return nullptr; + } + + return flat_ctxs_[0].get(); + } + + std::vector> *GetFlatCtxs() { + return &flat_ctxs_; + } + + HCCLContextMap *GetFlatCtx(size_t run_order) const { + return flat_ctxs_[run_order % flat_ctxs_.size()].get(); + } + + HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order, + bool use_hierarchical_allreduce) const { + if (!use_hierarchical_allreduce) { + return GetFlatCtx(run_order); + } + + return GetHierarchicalInterCtx(run_order); + } + + /* + When nccl inits nccl comm using ncclCommInitAll, it meets error when + allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So + create a new nccl comm for sync_batch_norm_op. And these codes should be + polished with a unified nccl management. 
+ */ + + HCCLContextMap *GetSyncBatchNormCtx( + framework::Scope *scope, const std::vector &places) { + auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME); + if (hccl_id_var != nullptr) { + return DefaultFlatCtx(); + } + + if (sync_batch_norm_ctx_.get() == nullptr) { + sync_batch_norm_ctx_.reset(new HCCLContextMap(places)); + } + return sync_batch_norm_ctx_.get(); + } + + void InitFlatCtxs(const std::vector &places, + const std::vector &hccl_ids, + size_t trainers_num, size_t trainer_id) { + if (hccl_ids.size() == 0) { + auto ptr = new platform::HCCLContextMap(places); + VLOG(1) << "init local trainer"; + flat_ctxs_.emplace_back(ptr); + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto ptr = new platform::HCCLContextMap(places, hccl_ids[i], + trainers_num, trainer_id); + VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; + flat_ctxs_.emplace_back(ptr); + } + } + + // as Executor have no way to use ncclComm created by ParallelExecutor, + // we assign all flatten contexts to HCCLCommContext to fix. 
+ int nranks = static_cast(trainers_num * places.size()); + int nrings = static_cast(flat_ctxs_.size()); + for (int ring_id = 0; ring_id < nrings; ++ring_id) { + for (size_t p = 0; p < places.size(); ++p) { + int rank = trainer_id * places.size() + p; + int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); + HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, + dev_id, ring_id); + } + } + } + + void InitHierarchicalCtxs(const std::vector &places, + const std::vector &inter_hccl_ids, + const std::vector &exter_hccl_ids, + size_t trainers_num, size_t trainer_id, + size_t inter_trainers_num, + size_t exter_trainers_num) { + PADDLE_ENFORCE_EQ( + trainers_num, inter_trainers_num * exter_trainers_num, + platform::errors::InvalidArgument( + "trainers_num:%llu != inter_trainers_num:%llu * " + "exter_trainers_num:%llu", + trainers_num, inter_trainers_num, exter_trainers_num)); + + PADDLE_ENFORCE_GT( + inter_trainers_num, 1, + platform::errors::InvalidArgument( + "The inter_trainers_num:%llu should be larger than 1.", + inter_trainers_num)); + + int inter_trainer_id = trainer_id % inter_trainers_num; + for (size_t i = 0; i < inter_hccl_ids.size(); i++) { + VLOG(1) << "init inter_trainer_id:" << inter_trainer_id + << ", comm no:" << i; + auto local = new HCCLContextMap(places, inter_hccl_ids[i], + inter_trainers_num, inter_trainer_id); + + h_inter_ctxs_.emplace_back(local); + } + + int exter_trainer_id = -1; + if (trainer_id % inter_trainers_num == 0) { + exter_trainer_id = trainer_id / inter_trainers_num; + } + + if (exter_trainer_id >= 0) { + for (size_t i = 0; i < exter_hccl_ids.size(); i++) { + auto ex = new HCCLContextMap(places, exter_hccl_ids[i], + exter_trainers_num, exter_trainer_id); + VLOG(1) << "init exter_trainer_id:" << exter_trainer_id + << ", comm no:" << i; + h_exter_ctxs_.emplace_back(ex); + } + } + } + + bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } + + 
HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); + } + + HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); + } + + std::vector> *GetHierarchicalInterCtxs() { + return &h_inter_ctxs_; + } + + std::vector> *GetHierarchicalExterCtxs() { + return &h_exter_ctxs_; + } + + protected: + // Support multi nccl comm on default nccl ring while HCCLContextMap can't. + std::vector> flat_ctxs_; + + // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. + // And h_exter_ctxs_ can support multi comm too. + std::vector> h_inter_ctxs_; + std::vector> h_exter_ctxs_; + + // just used for sync_batch_norm op. 
+ std::unique_ptr sync_batch_norm_ctx_; +}; + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 35776b9f1e6b88..0b683a742c9fd8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -135,13 +135,14 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, return mkldnn::memory::desc({dims}, data_type, format); } -inline void ClearMKLDNNCache(const platform::Place& place) { +inline void ClearMKLDNNCache(const platform::Place& place, + void* ptr = nullptr) { // Clear mkl-dnn cache, if (platform::is_cpu_place(place)) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(place); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap(ptr); platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( paddle::framework::DataLayout::kNCHW); } @@ -452,6 +453,9 @@ inline void AttachPointerHashToMKLDNNKey(void* ptr, paddle::platform::MKLDNNDeviceContext::tls().set_key_suffix( "E" + std::to_string(reinterpret_cast(ptr))); } + // Let's register adress of current executor + paddle::platform::MKLDNNDeviceContext::tls().set_curr_exec(ptr); + // For first thread if (first_thread == ThreadIDasStr()) { paddle::platform::MKLDNNDeviceContext::tls().disable_tid_in_key(); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 0c45da63edd70e..e584b849368e41 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -630,6 +630,67 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { } }; +template +class BroadcastDataMKLDNNHandler + : public platform::MKLDNNHandlerT { + public: + BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, + platform::Place cpu_place, const 
Tensor* x, + const Tensor* y, float scale_x, float scale_y, + const std::string& uniq_name, + const std::vector& input_dims) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor.")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor.")); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); + PADDLE_ENFORCE_NE( + y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Y tensor.")); + + const auto src0_tz = framework::vectorize(x->dims()); + + const auto src0_md = dnnl::memory::desc( + src0_tz, platform::MKLDNNGetDataType(), x->format()); + const auto src1_md = dnnl::memory::desc( + input_dims, platform::MKLDNNGetDataType(), x->format()); + + dnnl::primitive_attr attributes; + attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); + attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); + + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, + src1_md, src0_md); + } + } + + std::shared_ptr AcquireSrcMemory(framework::Tensor* input) { + T* input_data = input->data(); + memset(input_data, 0, this->fwd_pd_->src_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src_desc(), to_void_cast(input_data), "@src0_mem_p"); + } + + std::shared_ptr AcquireSecondSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + } +}; + template class ReductionMKLDNNHandler : public platform::MKLDNNHandlerT { @@ -639,7 +700,7 @@ class ReductionMKLDNNHandler const mkldnn::engine 
engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, const std::string& uniq_name, - std::vector output_dims) + std::vector y_tz) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), @@ -653,14 +714,14 @@ class ReductionMKLDNNHandler x->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument("Wrong format set for X tensor.")); - const auto src_tz = framework::vectorize(x->dims()); + const auto x_tz = framework::vectorize(x->dims()); - const auto src_md = dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), x->format()); - const auto dst_md = memory::desc( - output_dims, platform::MKLDNNGetDataType(), x->format()); + const auto x_md = dnnl::memory::desc( + x_tz, platform::MKLDNNGetDataType(), x->format()); + const auto y_md = + memory::desc(y_tz, platform::MKLDNNGetDataType(), x->format()); - this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps); + this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); } } }; diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc index 3814faa7662fc5..bb36eedb832381 100644 --- a/paddle/fluid/platform/npu_info.cc +++ b/paddle/fluid/platform/npu_info.cc @@ -190,6 +190,8 @@ void NPUMemcpySync(void *dst, const void *src, size_t count, enum aclrtMemcpyKind kind, size_t dst_max_count) { // NOTE(zhiqiu): The default max_count is count dst_max_count = dst_max_count ? dst_max_count : count; + VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " " + << kind; PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); } diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h new file mode 100644 index 00000000000000..a7b674d0d0c3fe --- /dev/null +++ b/paddle/fluid/platform/npu_profiler.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "acl/acl_prof.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_ASCEND_STRING +// For CANN 20.2+ +// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats +// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline +// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory +// ACL_AICORE_L0B_AND_WIDTH = 3, recore internal memory +// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMETIC_UTILIZATION; +#else +// For CANN 20.1 +// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats +// ACL_AICORE_PIPELINE = 1, record pipeline +// ACL_AICORE_SYNCHRONIZATION = 2, record sync +// ACL_AICORE_MEMORY = 3, recore memory +// ACL_AICORE_INTERNAL_MEMORY = 4, recore internal memory +// ACL_AICORE_STALL = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMATIC_THROUGHPUT; +#endif + +// ACL_PROF_ACL_API, record ACL API stats +// ACL_PROF_TASK_TIME, record AI core stats +// ACL_PROF_AICORE_METRICS, must include +// ACL_PROF_AICPU_TRACE, recore AICPU, not supported yet +constexpr uint64_t default_type = + ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME; + +aclprofConfig *NPUProfilerCreateConfig( + std::vector devices = {}, + aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type, + 
aclprofAicoreEvents *events = nullptr) { + if (devices.size() == 0) { + int device_id = GetCurrentNPUDeviceId(); + devices.emplace_back(device_id); + } + aclprofConfig *config = + aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c); + PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External( + "Failed to create prof config for NPU")); + return config; +} + +void NPUProfilerDestroyConfig(const aclprofConfig *config) { + PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config)); +} + +void NPUProfilerInit(std::string output_path) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclprofInit(output_path.c_str(), output_path.size())); +} + +void NPUProfilerStart(const aclprofConfig *config) { + if (config == nullptr) { + // NOTE(zhiqiu): support single device by default. + int device_id = GetCurrentNPUDeviceId(); + std::vector devices = {static_cast(device_id)}; + config = NPUProfilerCreateConfig(devices); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config)); +} + +void NPUProfilerStop(const aclprofConfig *config) { + PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config)); + NPUProfilerDestroyConfig(config); +} + +void NPUProfilerFinalize() { PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize()); } + +struct NPUProfConfigWrapper { + aclprofConfig *p_; + explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {} + aclprofConfig *ptr() { return p_; } +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/npu_resource_pool.cc b/paddle/fluid/platform/npu_resource_pool.cc new file mode 100644 index 00000000000000..22b9e8f03971e5 --- /dev/null +++ b/paddle/fluid/platform/npu_resource_pool.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_resource_pool.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace platform { + +NpuStreamResourcePool::NpuStreamResourcePool() { + int dev_cnt = platform::GetNPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetNPUDeviceId(dev_idx); + aclrtStream stream; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(&stream)); + return stream; + }; + + auto deleter = [dev_idx](aclrtStream stream) { + platform::SetNPUDeviceId(dev_idx); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream)); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +NpuStreamResourcePool& NpuStreamResourcePool::Instance() { + static NpuStreamResourcePool pool; + return pool; +} + +std::shared_ptr NpuStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +NpuEventResourcePool::NpuEventResourcePool() { + int dev_cnt = platform::GetNPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetNPUDeviceId(dev_idx); + aclrtEvent event; + 
PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); + return event; + }; + + auto deleter = [dev_idx](aclrtEvent event) { + platform::SetNPUDeviceId(dev_idx); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +NpuEventResourcePool& NpuEventResourcePool::Instance() { + static NpuEventResourcePool pool; + return pool; +} + +std::shared_ptr NpuEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/npu_resource_pool.h b/paddle/fluid/platform/npu_resource_pool.h new file mode 100644 index 00000000000000..bfd6ec7f941120 --- /dev/null +++ b/paddle/fluid/platform/npu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using NpuStreamObject = std::remove_pointer::type; +using NpuEventObject = std::remove_pointer::type; + +class NpuStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static NpuStreamResourcePool &Instance(); + + private: + NpuStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(NpuStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class NpuEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static NpuEventResourcePool &Instance(); + + private: + NpuEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(NpuEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index cfa3c6906f83f7..31193534a00be0 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -21,6 +21,7 @@ message Event { enum EventType { CPU = 0; GPUKernel = 1; + NPUKernel = 2; } optional EventType type = 8; optional string name = 1; @@ -39,6 +40,8 @@ message MemEvent { CUDAPlace = 0; CPUPlace = 1; CUDAPinnedPlace = 2; + XPUPlace = 3; + NPUPlace = 4; } optional uint64 start_ns = 1; optional uint64 end_ns = 2; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 287c8fc37e005a..9f4ec9b3ce0d44 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -71,6 +71,8 @@ void StreamCallbackManager::AddCallback( #endif #if PADDLE_WITH_ASCEND_CL + VLOG(3) << "aclrtLaunchCallback at stream: " << stream_; + // TODO(zhiqiu): failed to call aclrtLaunchCallback PADDLE_ENFORCE_NPU_SUCCESS(aclrtLaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_)); 
#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b43ad592a3a253..b30214e1d83559 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -61,7 +61,7 @@ set(PYBIND_SRCS if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) -endif(WITH_ASCEND) +endif() if(WITH_GLOO) set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context) @@ -86,7 +86,11 @@ endif() if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. - set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + if (WITH_ASCEND_CL) + set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag ascend_wrapper) + else() + set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + endif() list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) @@ -100,6 +104,7 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) if(WITH_ROCM) @@ -153,9 +158,9 @@ if(WITH_PYTHON) ) endif() else(WIN32) - # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, + # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to - # LD_LIBRARY_PATH. This is different with Windows platformm, which search + # LD_LIBRARY_PATH. This is different with Windows platformm, which search # *.dll in current directory automatically. 
add_custom_command(TARGET op_function_generator POST_BUILD diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 303ab5c0fe8ca4..43725f7dc0f73e 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #ifdef _POSIX_C_SOURCE @@ -108,12 +108,14 @@ enum AttrType { AT_NAMEATTR }; +#ifdef PADDLE_WITH_ASCEND void BindAscendDevice(py::module *m) { py::class_(*m, "NPUDevice") .def_static( "get_device_count", static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); } +#endif void BindAscendGraph(py::module *m) { m->def("ge_initialize", &ge_initialize, "GEInitialize"); diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h index e999080544c31b..15fb056c90e020 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.h +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -14,7 +14,7 @@ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index bc8d1e5b40585d..4824a34e843bb1 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -41,6 +41,7 @@ DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_bool(reader_queue_speed_test_mode); DECLARE_int32(call_stack_level); DECLARE_bool(sort_sum_gradient); +DECLARE_bool(check_kernel_launch); // device management DECLARE_int32(paddle_num_threads); // executor @@ -376,7 +377,7 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, 
FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, - FLAGS_conv2d_disable_cudnn); + FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch); #endif #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0817dc33671621..450c992d41118a 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -718,7 +718,8 @@ void BindImperative(py::module *m_ptr) { { // Release gil and do tracing py::gil_scoped_release release; - tracer->TraceOp("set_value", ins, outs, std::move(attrs)); + tracer->TraceOp("set_value", ins, outs, std::move(attrs), + {{"Input", "Out"}}); } } else { auto self_numpy = TensorToPyArray(*self_tensor); @@ -745,7 +746,7 @@ void BindImperative(py::module *m_ptr) { // inplace operator for the VarBase self. self->BumpInplaceVersion(); }) - .def("__getitem__", + .def("_getitem_index_not_tensor", [](std::shared_ptr &self, py::handle _index) { std::vector slice_axes, slice_starts, slice_ends, slice_strides, decrease_axis, infer_flags; @@ -783,6 +784,70 @@ void BindImperative(py::module *m_ptr) { return out; } }) + .def( + "_getitem_from_offset", + [](std::shared_ptr &self, const py::args &args) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor of %s is Empty, please check if it has no data.", + self->Name())); + + const auto &tensor_dims = tensor.dims(); + + std::vector dims(tensor_dims.size()); + std::vector strides(tensor_dims.size()); + + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + strides[i] = numel; + dims[i] = static_cast(tensor_dims[i]); + numel *= dims[i]; + } + size_t offset = 0; + if (args.empty()) { + PADDLE_ENFORCE_EQ( + numel, 1, + platform::errors::InvalidArgument( + "only one element tensors can be converted to Python " + 
"scalars when no input coordinates")); + } else if (args.size() == 1) { + offset = args[0].cast(); + PADDLE_ENFORCE_LT( + offset, numel, + platform::errors::InvalidArgument( + "index %d is out of bounds for size %d", offset, numel)); + } else { + PADDLE_ENFORCE_EQ(args.size(), dims.size(), + platform::errors::InvalidArgument( + "incorrect number of indices for Tensor")); + + for (size_t i = 0; i < args.size(); ++i) { + size_t index = args[i].cast(); + PADDLE_ENFORCE_LT( + index, dims[i], + platform::errors::InvalidArgument( + "index %d is out fo bounds for axis %d with size %d", + index, i, dims[i])); + offset += index * strides[i]; + } + } +#define TENSOR_TO_PY_SCALAR(T, proto_type) \ + if (tensor.type() == proto_type) { \ + std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(proto_type); \ + T b = TensorGetElement(tensor, offset); \ + return py::array(py::dtype(py_dtype_str.c_str()), {}, {}, \ + static_cast(&b)); \ + } + + _ForEachDataType_(TENSOR_TO_PY_SCALAR); +#undef TENSOR_TO_PY_SCALAR + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported tensor data type: %s", + framework::DataTypeToString(tensor.type()))); + }, + py::return_value_policy::copy) .def("_inplace_version", [](imperative::VarBase &self) -> uint32_t { const auto &var = self.MutableVar(); @@ -1487,7 +1552,7 @@ void BindImperative(py::module *m_ptr) { allow_ops); imperative::AmpOperators::Instance().GetMutableBlockOps()->swap( block_ops); - VLOG(4) << "AMP operators changed, " + VLOG(5) << "AMP operators changed, " << imperative::AmpOperators::Instance(); }) .def("_get_amp_op_list", diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index dd9cb65142a3de..8a5ad5852aedf5 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -467,7 +467,10 @@ void BindAnalysisConfig(py::module *m) { .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) 
.def("enable_xpu", &AnalysisConfig::EnableXpu, - py::arg("l3_workspace_size")) + py::arg("l3_workspace_size") = 16 * 1024 * 1024, + py::arg("locked") = false, py::arg("autotune") = true, + py::arg("autotune_file") = "", py::arg("precision") = "int16", + py::arg("adaptive_seqlen") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("use_gpu", &AnalysisConfig::use_gpu) .def("use_xpu", &AnalysisConfig::use_xpu) @@ -512,6 +515,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) + .def("enable_dlnne", &AnalysisConfig::EnableDlnne, + py::arg("min_subgraph_size") = 3) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("zero_copy") = false, diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 2c1927f49f6b70..bf3c77843219c7 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif @@ -123,14 +123,11 @@ std::map> op_passing_outs_map = { {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, + {"recv_v2", {"Out"}}, {"matmul", {"Out"}}, {"c_broadcast", {"Out"}}, {"c_sync_calc_stream", {"Out"}}, {"c_sync_comm_stream", {"Out"}}, - {"c_allreduce_sum", {"Out"}}, - {"c_allreduce_max", {"Out"}}, - {"c_allreduce_min", {"Out"}}, - {"c_allreduce_prod", {"Out"}}, {"c_reduce_sum", {"Out"}}, {"c_reduce_max", {"Out"}}, {"c_reduce_min", {"Out"}}, @@ -182,16 +179,16 @@ const char* 
OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableO const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})"; -const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( - if (%s != nullptr) { - ins["%s"] = {%s}; - } +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + if (%s != nullptr) { + ins["%s"] = {%s}; + } )"; -const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( if (%s.size() != 0) { - ins["%s"] = %s; - } + ins["%s"] = %s; + } )"; const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( @@ -264,8 +261,8 @@ R"( imperative::NameVarBaseMap ins = %s; %s tracer->TraceOp("%s", ins, outs, attrs, {%s}); - return %s; - } + return %s; + } })"; const char* PYBIND_ITEM_TEMPLATE = R"( %s.def("%s", &%s);)"; @@ -350,7 +347,7 @@ std::string GenerateOpFunctionsBody( } ins_initializer += "}"; - if (input_args.back() == ',') { + if (!input_args.empty() && input_args.back() == ',') { input_args.pop_back(); } @@ -364,6 +361,7 @@ std::string GenerateOpFunctionsBody( int outs_num = 0; for (auto& output : op_proto->outputs()) { auto& out_name = output.name(); + // skip those dispensable oututs if (output.dispensable() && !FindOutsMap(op_type, out_name)) { continue; @@ -459,7 +457,7 @@ std::string GenerateOpFunctionsBody( return_str.pop_back(); } outs_initializer += "}"; - if (inplace_mapping_str.back() == ',') { + if (!inplace_mapping_str.empty() && inplace_mapping_str.back() == ',') { inplace_mapping_str.pop_back(); } if (!use_inplace_strategy && FindViewOpMap(op_type)) { @@ -567,7 +565,7 @@ int main(int argc, char* argv[]) { return -1; } -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); ascend_ptr->InitGEForUT(); #endif @@ -602,8 +600,9 @@ int main(int argc, char* argv[]) { out.close(); -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL 
ge::GEFinalize(); #endif + return 0; } diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 06b3f10fefafa8..6fa49a85423c58 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -235,6 +235,7 @@ void BindOpDesc(pybind11::module *m) { const std::vector &vec_var_name) { self.SetOutput(name, vec_var_name); }) + .def("remove_output", &pd::OpDesc::RemoveOutput) .def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames) .def("_rename_input", &pd::OpDesc::RenameInput) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 39a78d86976ae9..560d8c892b09f9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -109,6 +109,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/npu_profiler.h" #endif #ifdef PADDLE_WITH_XPU @@ -267,11 +268,6 @@ bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; #endif - -#ifdef PADDLE_WITH_GRPC - return false; -#endif - return true; } @@ -500,7 +496,56 @@ PYBIND11_MODULE(core_noavx, m) { #endif return tensor; }); - + m.def("_save_lod_tensor", [](const LoDTensor &tensor, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", str_file_name)); + SerializeToStream(fout, tensor); + + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + m.def("_load_lod_tensor", [](LoDTensor &tensor, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load variables.", str_file_name)); + + DeserializeFromStream(fin, &tensor); + int64_t tellg = fin.tellg(); + fin.close(); + return 
tellg; + }); + m.def("_save_selected_rows", [](const SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable("Cannot open %s to save SelectedRows.", + str_file_name)); + + SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + m.def("_load_selected_rows", + [](SelectedRows &selected_rows, const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load SelectedRows.", str_file_name)); + + DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); m.def("_save_static_dict", [](const std::string &str_file_name, const py::handle &vec_var_list, const Scope &scope) { @@ -581,11 +626,6 @@ PYBIND11_MODULE(core_noavx, m) { make_ddim(x_dim), make_ddim(y_dim), -1)); }); -#ifdef PADDLE_WITH_ASCEND_CL - m.def("_npu_finalize", - []() { platform::AclInstance::Instance().Finalize(); }); -#endif - m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { @@ -1744,7 +1784,7 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use NPU because you have installed CPU/GPU version " "PaddlePaddle.\n" "If you want to use NPU, please try to install NPU version " - "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "PaddlePaddle by: pip install paddlepaddle-npu\n" "If you only have CPU, please change NPUPlace(%d) to be " "CPUPlace().\n", dev_id); @@ -2180,6 +2220,29 @@ All parameter, weight, gradient are variables in Paddle. 
#endif #endif +#ifdef PADDLE_WITH_ASCEND_CL + m.def("get_npu_device_count", platform::GetNPUDeviceCount); + m.def("npu_finalize", []() { platform::AclInstance::Instance().Finalize(); }); + + py::class_(m, "NPUProfConfigWrapper"); + + m.def("npu_prof_init", platform::NPUProfilerInit); + m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerStart(c.ptr()); + }); + m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerStop(c.ptr()); + }); + m.def("npu_prof_finalize", platform::NPUProfilerFinalize); + m.def("npu_prof_create_config", []() { + return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); + }); + + m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerDestroyConfig(c.ptr()); + }); +#endif + py::enum_(m, "TracerOption", py::arithmetic()) .value("kDefault", platform::TracerOption::kDefault) .value("kOpDetail", platform::TracerOption::kOpDetail) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ab1dd8a180b5b6..416361d06a996e 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -663,6 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, } bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); + bool is_npu_tensor = platform::is_npu_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = tensor.type(); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -681,7 +682,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); - if (!is_gpu_tensor && !is_xpu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -749,6 
+750,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); +#endif + } else if (is_npu_tensor) { +#ifdef PADDLE_WITH_ASCEND_CL + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, + copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 20c8794ba634c7..e53828ff10be60 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -26,22 +26,42 @@ set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) -taskkill /f /im op_function_generator.exe -wmic process where name="op_function_generator.exe" call terminate +taskkill /f /im op_function_generator.exe 2>NUL +taskkill /f /im cmake.exe 2>NUL +taskkill /f /im MSBuild.exe 2>NUL +taskkill /f /im 
CL.exe 2>NUL +taskkill /f /im Lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL taskkill /f /im python.exe 2>NUL - +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" if not defined BRANCH set BRANCH=develop -if not defined WITH_TENSORRT set WITH_TENSORRT=ON +if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto if not defined WITH_GPU set WITH_GPU=ON if not defined WITH_MKL set WITH_MKL=ON if not defined WITH_AVX set WITH_AVX=ON if not defined WITH_TESTING set WITH_TESTING=ON -if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=OFF +if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON @@ -54,6 +74,8 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_ if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF +if not defined retry_times set retry_times=2 +if not defined 
PYTHON_ROOT set PYTHON_ROOT=C:\Python37 rem -------set cache build directory----------- rmdir build\python /s/q @@ -62,9 +84,6 @@ rmdir build\paddle_inference_install_dir /s/q rmdir build\paddle_inference_c_install_dir /s/q del build\CMakeCache.txt -: set CI_SKIP_CPP_TEST if only *.py changed -git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON - if "%WITH_CACHE%"=="OFF" ( rmdir build /s/q goto :mkbuild @@ -114,72 +133,21 @@ dir . dir %cache_dir% dir paddle\fluid\pybind\Release -rem ------initialize the python environment------ -if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 -set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe -set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% - -rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled -rem Now use system python environment temporarily -rem %PYTHON_EXECUTABLE% -m pip install virtualenv -rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci -rem call paddle_winci\Scripts\activate.bat - -rem ------pre install python requirement---------- -where python -where pip -pip install wheel --user -pip install -r %work_dir%\python\unittest_py\requirements.txt --user -pip install -r %work_dir%\python\requirements.txt --user - -if %ERRORLEVEL% NEQ 0 ( - echo pip install requirements.txt failed! - exit /b 7 -) - -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y clcache -:: set USE_CLCACHE to enable clcache -rem set USE_CLCACHE=1 -:: In some scenarios, CLCACHE_HARDLINK can save one file copy. -rem set CLCACHE_HARDLINK=1 -:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 -:: set maximum cache size to 20G -rem clcache.exe -M 21474836480 - -:: install ninja if GENERATOR is Ninja -if %GENERATOR% == "Ninja" ( - pip install ninja - if %errorlevel% NEQ 0 ( - echo pip install ninja failed! 
- exit /b 7 - ) -) - -rem ------show summary of current environment---------- -cmake --version -if "%WITH_GPU%"=="ON" ( - nvcc --version - where nvidia-smi - nvidia-smi -) -python %work_dir%\tools\summary_env.py -%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh - goto :CASE_%1 echo "Usage: paddle_build.bat [OPTION]" echo "OPTION:" echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows" echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows" +echo "build_avx_whl: build Windows avx whl package on Windows" +echo "build_no_avx_whl: build Windows no avx whl package on Windows" exit /b 1 rem ------PR CI windows check for MKL/GPU---------- :CASE_wincheck_mkl set WITH_MKL=ON set WITH_GPU=ON +set WITH_AVX=ON set MSVC_STATIC_CRT=OFF call :cmake || goto cmake_error @@ -192,9 +160,11 @@ goto:success rem ------PR CI windows check for OPENBLAS/CPU------ :CASE_wincheck_openblas -set WITH_MKL=ON +set WITH_MKL=OFF set WITH_GPU=OFF +set WITH_AVX=OFF set MSVC_STATIC_CRT=ON +set retry_times=1 call :cmake || goto cmake_error call :build || goto build_error @@ -209,6 +179,7 @@ rem ------Build windows avx whl package------ set WITH_AVX=ON set ON_INFER=OFF set CUDA_ARCH_NAME=All +set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -220,6 +191,7 @@ rem ------Build windows no-avx whl package------ set WITH_AVX=OFF set ON_INFER=OFF set CUDA_ARCH_NAME=All +set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -240,8 +212,10 @@ rem "Other configurations are added here" rem :CASE_wincheck_others rem call ... + rem --------------------------------------------------------------------------------------------- :cmake +@ECHO OFF echo ======================================== echo Step 1. Cmake ... echo ======================================== @@ -249,16 +223,58 @@ echo ======================================== rem Configure the environment for 64-bit builds. 
'DISTUTILS_USE_SDK' indicates that the user has selected the compiler. call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" set DISTUTILS_USE_SDK=1 +rem Windows 10 Kit bin dir +set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% -@ECHO ON -if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0 +if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH% -rem ------set third_party cache dir------ +rem install ninja if GENERATOR is Ninja +if %GENERATOR% == "Ninja" ( + pip install ninja + if %errorlevel% NEQ 0 ( + echo pip install ninja failed! + exit /b 7 + ) +) + +rem ------show summary of current GPU environment---------- +cmake --version +if "%WITH_GPU%"=="ON" ( + nvcc --version + nvidia-smi +) + +rem ------initialize the python environment------ +set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe +set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% +if %WITH_PYTHON% == "OFF" ( + where python + where pip + pip install wheel --user + pip install -r %work_dir%\python\requirements.txt --user + if %ERRORLEVEL% NEQ 0 ( + echo pip install requirements.txt failed! + exit /b 7 + ) +) +rem ------pre install clcache and init config---------- +rem pip install clcache --user +pip uninstall -y clcache +:: set USE_CLCACHE to enable clcache +rem set USE_CLCACHE=1 +:: In some scenarios, CLCACHE_HARDLINK can save one file copy. 
+rem set CLCACHE_HARDLINK=1 +:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported +rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 +:: set maximum cache size to 20G +rem clcache.exe -M 21474836480 + +rem ------set third_party cache dir------ : clear third party cache every once in a while for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# set day_now=%datetime:~6,2% @@ -342,7 +358,7 @@ if %GENERATOR% == "Ninja" ( ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 2 ( + if %build_times% GTR %retry_times% ( exit /b 7 ) else ( echo Build third_party failed, will retry! @@ -356,6 +372,28 @@ set build_times=1 :: reset clcache zero stats for collect PR's actual hit rate rem clcache.exe -z +rem -------clean up environment again----------- +taskkill /f /im MSBuild.exe 2>NUL +taskkill /f /im cl.exe 2>NUL +taskkill /f /im lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL + echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( ninja -j %PARALLEL_PROJECT_COUNT% @@ -369,7 +407,7 @@ if %GENERATOR% == "Ninja" ( if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 1 ( + if %build_times% GTR %retry_times% ( exit /b 7 ) else ( echo Build 
Paddle failed, will retry! @@ -450,6 +488,16 @@ echo ======================================== echo Step 4. Running unit tests ... echo ======================================== + +: set CI_SKIP_CPP_TEST if only *.py changed +git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON + +pip install -r %work_dir%\python\unittest_py\requirements.txt --user +if %ERRORLEVEL% NEQ 0 ( + echo pip install unittest requirements.txt failed! + exit /b 7 +) + for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -706,9 +754,21 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL +taskkill /f /im python.exe 2>NUL +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL -taskkill /f /im python.exe 2>NUL echo Windows CI run successfully! 
exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d834d1f87a273c..0865d48c0d3432 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -145,6 +145,18 @@ function cmake_base() { else exit 1 fi + elif [ "$1" == "cp39-cp39" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.9/include/python3.9/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/libpython3.9.dylib" + pip3.9 install --user -r ${PADDLE_ROOT}/python/requirements.txt + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -205,6 +217,13 @@ function cmake_base() { -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so" pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt + elif [ "$1" == "cp39-cp39" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.9.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.9.0/bin/python3.9 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.9.0/include/python3.9 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.9.0/lib/libpython3.so" + pip3.9 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "conda-python3.7" ]; then export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH} export PATH=/opt/conda/bin/:${PATH} @@ -227,7 +246,6 @@ function cmake_base() { fi distibuted_flag=${WITH_DISTRIBUTE:-OFF} - 
grpc_flag="OFF" gloo_flag=${distibuted_flag} cat <> ${PADDLE_ROOT}/build/build_summary.txt paddle version # Recovery proxy to avoid failure in later steps - set +x export http_proxy=$my_proxy export https_proxy=$my_proxy if [ "$mactest_error" != 0 ];then show_ut_retry_result fi - set -x fi } function get_precision_ut_mac() { on_precision=0 - set -x UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') precison_cases="" if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python3.7 $PADDLE_ROOT/tools/get_pr_ut.py if [[ -f "ut_list" ]]; then - set +x echo "PREC length: "`wc -l ut_list` precision_cases=`cat ut_list` - set -x fi fi if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then @@ -803,7 +818,7 @@ function generate_api_spec() { awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" || [ "$1" == "cp39-cp39" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path @@ -848,7 +863,7 @@ function check_approvals_of_unittest() { echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. 
\n" echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" echo "************************************" - exit 1 + exit 6 fi fi fi @@ -1435,6 +1450,7 @@ function parallel_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install ${PADDLE_ROOT}/build/python/dist/*whl + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then parallel_test_base_gpu else @@ -1540,12 +1556,14 @@ EOF ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl @@ -1553,11 +1571,13 @@ EOF ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl 
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl fi #ref_paddle2_mv1="" @@ -1678,6 +1698,22 @@ EOF apt-get clean -y && \ rm -f ${ref_paddle38} && \ ldconfig +EOF + cat >> ${PADDLE_ROOT}/build/Dockerfile < /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.9.0.tgz + RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ + wget ${ref_web}/${ref_paddle39} && pip3.9 install ${ref_paddle39_whl}; apt-get install -f -y && \ + apt-get clean -y && \ + rm -f ${ref_paddle39} && \ + ldconfig EOF cat >> ${PADDLE_ROOT}/build/Dockerfile < envs; std::vector undefok; -#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC) && \ - !defined(PADDLE_WITH_PSLIB) +#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) std::string str_max_body_size; if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", &str_max_body_size)) { diff --git a/patches/eigen/TensorReductionGpu.h b/patches/eigen/TensorReductionGpu.h new file mode 100644 index 00000000000000..696078e54881af --- 
/dev/null +++ b/patches/eigen/TensorReductionGpu.h @@ -0,0 +1,996 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// clang-format off +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H + +namespace Eigen { +namespace internal { + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple gpu thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another gpu thread +// updated the content of the output address it will try again. 
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template